Developing surveys to gather accurate information about populations involves a more intricate and time-intensive process compared to surveys that use non-random criteria for selecting samples. Researchers can spend months, or even years, developing the study design, questions, and other methods for a single survey to ensure high-quality data is collected.
While this book focuses on the analysis methods of complex surveys, understanding the entire survey life cycle can provide a better insight into what types of analyses should be conducted on the data. The survey life cycle consists of the stages required to successfully execute a survey project. Each stage influences the timing, costs, and feasibility of the survey, consequently impacting the data collected and how it should be analyzed.
The survey life cycle starts with a research topic or question of interest (e.g., what impact does childhood trauma have on health outcomes later in life). Researchers typically review existing data sources to determine if data are already available that can answer this question, as drawing from available resources can result in a reduced burden on respondents, cheaper research costs, and faster research outcomes. However, if existing data cannot answer the nuances of the research question, a survey can be used to capture the exact data that the researcher needs.
To gain a deeper understanding of survey design and implementation, there are many pieces of existing literature that we recommend reviewing in detail (e.g., Dillman, Smyth, and Christian 2014; Groves et al. 2009; Tourangeau, Rips, and Rasinski 2000; Bradburn, Sudman, and Wansink 2004; Valliant, Dever, and Kreuter 2013; Biemer and Lyberg 2003).
When starting a survey, there are multiple things to consider. Errors are the differences between the true values of the variables being studied and the values obtained through the survey. Each step and decision made before the launch of the survey can impact the types of error that are introduced into the data, which in turn impact how to interpret the results.
Generally, survey researchers consider there to be seven main sources of error that fall into two major categories of Representation and Measurement (Groves et al. 2009):
Almost every survey will have some errors. Researchers attempt to conduct a survey that reduces the total survey error, or the accumulation of all errors that may arise throughout the survey life cycle. By assessing these different types of errors together, researchers can seek strategies to maximize the overall survey quality and improve the reliability and validity of results (Biemer 2010). However, attempts to lower individual sources of error (and therefore total survey error) come at the price of time, resources, and money:
Let’s use a simple example where a researcher is interested in the average number of pets in a household. Our researcher will need to consider the target population for this study. Specifically, are they interested in all households in a given country or households in a more local area (e.g., city or state)? Let’s assume our researcher is interested in the number of pets in a U.S. household with at least one adult (18 years old or older). In this case, using a sampling frame of mailing addresses would provide the least coverage error as the frame would closely match our target population. Specifically, our researcher would most likely want to use the Computerized Delivery Sequence File (CDSF), which is a file of mailing addresses that the United States Postal Service (USPS) creates and covers nearly 100% of U.S. households (Harter et al. 2016). To sample these households, for simplicity, we will use a stratified simple random sample design, where we randomly sample households within each state (i.e., we stratify by state).
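To make this design concrete, here is a minimal sketch of stratified simple random sampling in R. The frame, state values, and sample size per stratum are all hypothetical, chosen only for illustration:

```r
library(dplyr)

# Hypothetical sampling frame of mailing addresses with a state identifier
set.seed(2023)
frame <- tibble(
  address_id = 1:1000,
  state = sample(c("NC", "SC", "VA"), size = 1000, replace = TRUE)
)

# Stratified simple random sample: randomly select 10 addresses per state
strat_sample <- frame %>%
  group_by(state) %>%
  slice_sample(n = 10) %>%
  ungroup()

nrow(strat_sample)  # 30 addresses: 10 from each of the 3 strata
```

In a real survey, the number of sampled households per stratum would be set by precision targets and budget rather than a fixed 10 per state.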
Throughout this chapter, we will build on this example research question to plan a survey.
Researchers can use a single mode to collect data or multiple modes (also called mixed modes). Using mixed modes can allow for broader reach and increase response rates depending on the target population (DeLeeuw 2005, 2018; Biemer et al. 2017). For example, researchers could both call households to conduct a CATI survey and send mail with a PAPI survey to the household. Using both of these modes, researchers could gain participation through the mail from individuals who do not pick up the phone to unknown numbers or through the phone from individuals who do not open all of their mail. However, mode effects (where responses differ based on the mode of response) can be present in the data and may need to be considered during analysis.
When selecting which mode, or modes, to use, understanding the unique aspects of the chosen target population and sampling frame will provide insight into how they can best be reached and engaged. For example, if we plan to survey adults aged 18-24 who live in North Carolina, asking them to complete a survey using CATI (i.e., over the phone) would most likely not be as successful as other modes like the web. This age group does not talk on the phone as much as other generations and often does not answer calls from unknown numbers. Additionally, the mode for contacting respondents relies on what information is available on the sampling frame. For example, if our sampling frame includes an email address, we could email our selected sample members to convince them to complete a survey. Or if the sampling frame is a list of mailing addresses, researchers would have to contact sample members with a letter.
It is important to note that there can be a difference between the contact and survey modes. For example, if we have a sampling frame with addresses, we can send a letter to our sample members and provide information on how to complete a web survey. Or we could use mixed-mode surveys and send sample members a paper and pencil survey with our letter and also ask them to complete the survey online. Combining different contact modes and different survey modes can be useful in reducing unit nonresponse error–where the entire unit (e.g., a household) does not respond to the survey at all–as different sample members may respond better to different contact and survey modes. However, when considering which modes to use, it is important to make access to the survey as easy as possible for sample members to reduce burden and unit nonresponse.
Another way to reduce unit nonresponse error is through varying the language of the contact materials (Dillman, Smyth, and Christian 2014). People are motivated by different things, so constantly repeating the same message may not be helpful. Instead, mixing up the messaging and the type of contact material the sample member receives can increase response rates and reduce the unit nonresponse error. For example, instead of only sending standard letters, researchers could consider sending mailings that invoke “urgent” or “important” thoughts by sending priority letters or using other delivery services like FedEx, UPS, or DHL.
A study timeline may also determine the number and types of contacts. If the timeline is long, then there is ample time for follow-ups and varying the message in contact materials. If the timeline is short, then fewer follow-ups can be implemented. Many studies will start with the tailored design method put forth by Dillman, Smyth, and Christian (2014) and implement five contacts:
Researchers can benefit from the work of others by using questions from other surveys. Demographic questions such as race, ethnicity, or education often use questions from a government census or other official surveys. Other survey questions can be found using question banks, which are compilations of questions that have been asked across various surveys, such as the Inter-university Consortium for Political and Social Research (ICPSR) variable search.
If a question does not exist in a question bank, researchers can craft their own. When creating their own questions, researchers should start with the research question or topic and attempt to write questions that match the concept. The closer the question asked is to the overall concept, the better validity there is. For example, if the researcher wants to know how people consume TV series and movies but only asks a question about how many TVs are in the house, then they would be missing other ways that people watch TV series and movies, such as on other devices or at places outside of the home. As mentioned above, researchers can employ techniques to increase the validity of their questionnaire. For example, questionnaire testing involves conducting a pilot of the survey instrument to identify and fix potential issues before the main survey is conducted. Cognitive interviewing is a technique where researchers walk through the survey with participants, encouraging them to speak their thoughts out loud to uncover how they interpret and understand survey questions.
Additionally, when designing questions, researchers should consider the mode for the survey and adjust language appropriately. In self-administered surveys (e.g., web or mail), respondents can see all the questions and response options, but that is not the case in interviewer-administered surveys (e.g., CATI or CAPI). With interviewer-administered surveys, the response options need to be read aloud to the respondents, so the question may need to be adjusted to allow a better flow to the interview. Additionally, with self-administered surveys, because the respondents are viewing the questionnaire, the formatting of the questions is even more important to ensure accurate measurement. Incorrect formatting or wording can result in measurement error, so following best practices or using existing validated questions can reduce error. There are multiple resources to help researchers draft questions for different modes (e.g., Dillman, Smyth, and Christian 2014; Fowler and Mangione 1989; Bradburn, Sudman, and Wansink 2004; Tourangeau, Couper, and Conrad 2004).
As part of our survey on the average number of pets in a household, researchers may want to know what animal most people prefer to have as a pet. Let’s say we have the following question in our survey:
Once the data collection starts, researchers try to stick to the data collection protocol designed during pre-survey planning. However, a good researcher will adjust their plans and adapt as needed based on the current progress of data collection (Schouten, Peytchev, and Wagner 2018). Some extreme examples are natural disasters that prevent mail or interviewers from reaching sample members. Other adjustments are smaller; for example, if something newsworthy occurs that is connected to the survey, researchers could choose to highlight it in communication materials. In addition to these external factors, there could be factors unique to the survey, such as lower response rates for a specific subgroup, in which case the data collection protocol may need to be adjusted to improve response rates for that group.
Post-survey cleaning and imputation is one of the first steps researchers will take to get the survey responses into a dataset for use by analysts. Data cleaning can consist of correcting inconsistent data (e.g., fixing skip pattern errors or reconciling related questions so responses are consistent with each other), editing numeric entries or open-ended responses for grammar and consistency, or recoding open-ended questions into categories for analysis. There is no universal set of fixed rules that every project must adhere to. Instead, each project or research study should establish its own guidelines and procedures for handling various cleaning scenarios based on its specific objectives.
Researchers should use their best judgment to ensure data integrity, and all decisions should be documented and available to those using the data in the analysis. Each decision a researcher makes impacts processing error, so often researchers will have multiple people review these rules or recode open-ended data and adjudicate any differences in an attempt to reduce this error.
Another crucial step in post-survey processing is imputation. Often, there is item nonresponse where respondents do not answer specific questions. If the questions are crucial to analysis efforts or the research question, researchers may implement imputation in an effort to reduce item nonresponse error. Imputation is a technique for replacing missing or incomplete data values with estimated values. However, as imputation is a way of assigning a value to missing data based on an algorithm or model, it can also introduce processing error, so researchers should consider the overall implications of imputing data compared to having item nonresponse. There are multiple ways imputation can be conducted. We recommend reviewing other resources like Kim and Shao (2021) for more information.
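As a minimal sketch of the simplest approach (mean imputation) on a hypothetical variable — real studies typically use more principled hot-deck or model-based methods such as those covered by Kim and Shao (2021):

```r
# Hypothetical responses to "number of pets" with item nonresponse (NA)
num_pets <- c(2, 0, NA, 1, 3, NA)

# Mean imputation: replace each missing value with the observed mean.
# This preserves the mean of the variable but understates its variance,
# which is one reason more sophisticated methods are usually preferred.
num_pets_imputed <- ifelse(is.na(num_pets),
                           mean(num_pets, na.rm = TRUE),
                           num_pets)

num_pets_imputed  # 2.0 0.0 1.5 1.0 3.0 1.5
```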
Let’s return to the question we created to ask about animal preference. The “other specify” invites respondents to specify the type of animal they prefer to have as a pet. If respondents entered answers such as “puppy,” “turtle,” “rabit,” “rabbit,” “bunny,” “ant farm,” “snake,” “Mr. Purr,” then researchers may wish to categorize these write-in responses to help with analysis. In this example, “puppy” could be assumed to be a reference to a “Dog”, and could be recoded there. The misspelling of “rabit” could be coded along with “rabbit” and “bunny” into a single category of “Bunny or Rabbit”. These are relatively standard decisions that a researcher could make. The remaining write-in responses could be categorized in a few different ways. “Mr. Purr,” which may be someone’s reference to their own cat, could be recoded as “Cat”, or it could remain as “Other” or some category that is “Unknown”. Depending on the number of responses related to each of the others, they could all be combined into a single “Other” category, or maybe categories such as “Reptiles” or “Insects” could be created. Each of these decisions may impact the interpretation of the data, so our researcher should document the types of responses that fall into each of the new categories and any decisions made.
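One way to implement this kind of recoding is with `dplyr::case_when()`. The write-in values below are the hypothetical responses from the example, and the category assignments are judgment calls of the sort described above:

```r
library(dplyr)

# Hypothetical write-in responses from the "other specify" option
write_ins <- tibble(
  pet_other = c("puppy", "rabit", "rabbit", "bunny", "Mr. Purr", "ant farm")
)

# Recode write-ins into analysis categories; these choices are judgment
# calls and should be documented alongside the released data
recoded <- write_ins %>%
  mutate(pet_category = case_when(
    pet_other == "puppy" ~ "Dog",
    pet_other %in% c("rabit", "rabbit", "bunny") ~ "Bunny or Rabbit",
    pet_other == "Mr. Purr" ~ "Cat",
    TRUE ~ "Other"
  ))
```

Keeping the recoding rules in code like this, rather than editing values by hand, makes the decisions reproducible and easy for a second coder to review.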
Weighting can typically be used to address some of the error sources identified in the previous sections. For example, weights may be used to address coverage, sampling, and nonresponse errors. Many published surveys will include an “analysis weight” variable that combines these adjustments. However, weighting itself can also introduce adjustment error, so researchers need to balance which types of errors should be corrected with weighting. The construction of weights is outside the scope of this book, and researchers should reference other materials if interested in constructing their own (Valliant and Dever 2018). Instead, this book assumes the survey has been completed, weights are constructed, and data is made available for users. We will walk users through how to read the documentation (Chapter 4) and work with the data and analysis weights provided to analyze and interpret survey results correctly.
In the simple example of our survey, we decided to use a stratified sample by state to select our sample members. Knowing this sampling design, our researcher can include selection weights for analysis that account for how the sample members were selected for the survey. Additionally, the sampling frame may have the type of building associated with each address, so we could include the building type as a potential nonresponse weighting variable, along with some interviewer observations that may be related to our research topic of the average number of pets in a household. Combining these weights, we can create an analytic weight that researchers will need to use when analyzing the data.
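A minimal sketch of how these pieces might combine into an analytic weight. The base (selection) weights and response rates below are hypothetical, and real nonresponse adjustments are usually model-based (e.g., response propensity models) rather than simple cell adjustments:

```r
library(dplyr)

# Hypothetical respondents with a selection (base) weight from the
# stratified design and a response rate within their adjustment cell
# (e.g., a cell defined by building type)
respondents <- tibble(
  hh_id = 1:4,
  base_wt = c(120, 120, 300, 300),
  cell_resp_rate = c(0.6, 0.6, 0.5, 0.5)
)

# Analytic weight = base weight x nonresponse adjustment (1 / response rate)
respondents <- respondents %>%
  mutate(analytic_wt = base_wt / cell_resp_rate)

respondents$analytic_wt  # 200 200 600 600
```

The intuition: each respondent's weight is inflated to also represent similar sampled households that did not respond.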
Before data is made publicly available, researchers will need to ensure that individual respondents cannot be identified by the data when confidentiality is required. There are a variety of different methods that can be used, including data swapping, top or bottom coding, coarsening, and perturbation. In data swapping, researchers may swap specific data values across different respondents so that it does not impact insights from the data but ensures that specific individuals cannot be identified. For extreme values, top and bottom coding is sometimes used. For example, researchers may top-code income values such that households with income greater than $99,999,999 are coded into a single category of $99,999,999 or more. Other disclosure methods may include aggregating response categories or location information to avoid having only a few respondents in a given group who could then be identified. For example, researchers may use coarsening to display income in categories instead of as a continuous variable. Data producers may also perturb the data by adding random noise. There is as much art as there is science to the methods used for disclosure, and in documentation, researchers should only provide high-level comments that disclosure was conducted and not specific details to ensure nobody can reverse the disclosure and thus identify individuals. For more information on different disclosure methods, please see Skinner (2009) and AAPOR Standards.
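A minimal sketch of two of these techniques, top-coding and coarsening, on hypothetical income values (the breakpoints and category labels are made up for illustration):

```r
# Hypothetical household incomes, including one extreme value
hhincome <- c(25000, 80000, 150000000)

# Top-coding: extreme values are collapsed to a single maximum value
hhincome_tc <- pmin(hhincome, 99999999)

# Coarsening: report income in categories rather than as a continuous value
hhincome_cat <- cut(hhincome_tc,
                    breaks = c(0, 50000, 100000, Inf),
                    labels = c("Under $50,000", "$50,000-$99,999",
                               "$100,000 or more"))
```

Note that actual disclosure protections combine several techniques and, as the text emphasizes, the specific parameters used are deliberately not published.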
Documentation is a critical step of the survey life cycle. Researchers systematically record all the details, decisions, procedures, and methodologies to ensure transparency, reproducibility, and the overall quality of survey research.
Proper documentation allows analysts to understand, reproduce, and evaluate the study’s methods and findings. Chapter 4 dives into how analysts should use survey data documentation.
Other modes such as using mobile apps or text messaging can also be considered, but at the time of publication, have smaller reach or are better for longitudinal studies (i.e., surveying the same individuals over many time periods of a single study)↩︎
https://www-archive.aapor.org/Standards-Ethics/AAPOR-Code-of-Ethics/Survey-Disclosure-Checklist.aspx↩︎
For this chapter, here are the libraries and helper functions we will need:
data(api)
data(scd)
Additionally, we have created multiple analytic datasets for use in this book on a directory on OSF4. To load any data used in the book that is not included in existing packages, we have created a helper function read_osf(). This chapter uses data from the Residential Energy Consumption Survey (RECS), so we will use the following code to load the RECS data to use later in this chapter:
recs_in <- read_osf("recs_2015.rds")
Replicate weights generally come in groups and are sequentially numbered, such as PWGTP1, PWGTP2, …, PWGTP80 for the person weights in the American Community Survey (ACS) (U.S. Census Bureau 2021) or BRRWT1, BRRWT2, …, BRRWT96 in the 2015 Residential Energy Consumption Survey (RECS) (U.S. Energy Information Administration 2017). This makes it easy to use some of the tidy selection5 functions in R. For example, if a dataset had WT0 for the main weight and 20 BRR weights named WT1, WT2, …, WT20, we can use the following syntax (both are equivalent):
brr_des <- dat %>%
  as_survey_rep(
    weights = WT0,
    repweights = num_range("WT", 1:20),
    type = "BRR"
  )
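The two equivalent tidy selection styles — `num_range()` and `all_of()` with a constructed character vector — can be checked directly with `dplyr::select()`. This sketch uses a small hypothetical data frame with columns WT0 through WT20:

```r
library(dplyr)

# Hypothetical data frame with main weight WT0 and replicates WT1-WT20
dat <- as.data.frame(matrix(1, nrow = 2, ncol = 21))
names(dat) <- paste0("WT", 0:20)

# Both selections pick the same 20 replicate-weight columns
sel1 <- dat %>% select(num_range("WT", 1:20))
sel2 <- dat %>% select(all_of(paste0("WT", 1:20)))

identical(names(sel1), names(sel2))  # TRUE
```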
Example
The American Community Survey releases public use microdata with JK1 weights at the person and household level. This example includes data at the household level where the replicate weights are specified as WGTP1, …, WGTP80, and the main weight is WGTP (U.S. Census Bureau 2023). Using the {tidycensus} package6, data is downloaded from the Census API. For example, the code below has a request to obtain data for each person in each household in two Public Use Microdata Areas (PUMAs) in Durham County, NC7. The variables requested are NP (number of persons in the household), BDSP (number of bedrooms), HINCP (household income), and TYPEHUGQ (type of household). By default, several other variables will come along, including SERIALNO (a unique identifier for each household), SPORDER (a unique identifier for each person within each household), PUMA, ST (state), person weight (PWGTP), and the household weights (WGTP, WGTP1, …, WGTP80). Filtering to records where SPORDER=1 yields only one record per household and TYPEHUGQ=1 filters to only households and not group quarters.
pums_in <- get_pums(
  variables = c("NP", "BDSP", "HINCP"),
  state = "37",
3.4 Understanding survey design documentation
A common method of sampling is to stratify PSUs, select PSUs within the stratum using PPS selection, and then select units within the PSUs either with SRS or PPS. Reading survey documentation is an important first step in survey analysis to understand the design of the survey you are using. Good documentation will highlight the variables necessary to specify the design; this information is often found in user’s guides, methodology reports, analysis guides, or technical documentation (see Chapter 4 for more details).
Example
+For example, the 2017-2019 National Survey of Family Growth (NSFG)8 had a stratified multi-stage area probability sample. In the first stage, PSUs are counties or collections of counties and are stratified by Census region/division, size (population), and MSA status. Within each stratum, PSUs were selected via PPS. In the second stage, neighborhoods were selected within the sampled PSUs using PPS selection. In the third stage, housing units were selected within the sampled neighborhoods. In the fourth stage, a person was randomly chosen within the selected housing units among eligible persons using unequal probabilities based on the person’s age and sex. The public use file does not include all these levels of selection and instead has pseudo-strata and pseudo-clusters, which are the variables used in R to specify the design. As specified on page 4 of the documentation, the stratum variable is SEST
, the cluster variable is SECU
, and the weight variable is WGT2017_2019
. Thus, to specify this design in R, use the following syntax:
nsfg_des <- nsfgdata %>%
  as_survey_design(ids = SECU,
                   strata = SEST,
                   weights = WGT2017_2019)
@@ -1272,12 +1272,12 @@ 3.5 Exercises
- The American National Election Studies (ANES) collect data before and after elections approximately every four years around the presidential election cycle. Each year with the data release, a user’s guide is also released9. What is the syntax for specifying the analysis of the full sample post-election data?
anes_des <- anes_data %>%
  as_survey_design(weight)
- The General Social Survey is a survey that has been administered since 1972 on social, behavioral, and attitudinal topics. The 2016-2020 GSS Panel codebook10 provides examples of setting up syntax in SAS and Stata but not R. How would you specify the design in R?
gss_des <- gss_data %>%
  as_survey_design(ids = VPSU_2,
@@ -1324,14 +1324,14 @@ References
https://osf.io/gzbkn/?view_only=8ca80573293b4e12b7f934a0f742b957
dplyr documentation on tidy-select: https://dplyr.tidyverse.org/reference/dplyr_tidy_select.html
tidycensus package: https://walker-data.com/tidycensus/
Public Use Microdata Areas in North Carolina: https://www.census.gov/geographies/reference-maps/2010/geo/2010-pumas/north-carolina.html
2017-2019 National Survey of Family Growth (NSFG): Sample Design Documentation - https://www.cdc.gov/nchs/data/nsfg/NSFG-2017-2019-Sample-Design-Documentation-508.pdf
ANES 2020 User’s Guide: https://electionstudies.org/wp-content/uploads/2022/02/anes_timeseries_2020_userguidecodebook_20220210.pdf
2016-2020 GSS Panel Codebook Release 1a: https://gss.norc.org/Documents/codebook/2016-2020%20GSS%20Panel%20Codebook%20-%20R1a.pdf
diff --git a/c04-understanding-survey-data-documentation.html b/c04-understanding-survey-data-documentation.html
index 326bdaf9..0c9487bb 100644
--- a/c04-understanding-survey-data-documentation.html
+++ b/c04-understanding-survey-data-documentation.html
@@ -185,7 +185,7 @@
- 3 Specifying sample designs and replicate weights in {srvyr}
-- Prerequisites
+- Prerequisites
- 3.1 Introduction
- 3.2 Common sampling designs
@@ -206,7 +206,7 @@
- 4 Understanding survey data documentation
-- Prerequisites
+- Prerequisites
- 4.1 Introduction
- 4.2 Types of survey documentation
@@ -226,7 +226,7 @@
- 5 Descriptive analyses in srvyr
- 6 Statistical testing
-- Prerequisites
+- Prerequisites
- 6.1 Introduction
- 6.2 Comparison of Proportions and Means
@@ -279,7 +279,7 @@
- 7 Modeling
-- Prerequisites
+- Prerequisites
- 7.1 Introduction
- 7.2 Analysis of Variance (ANOVA)
@@ -300,7 +300,7 @@
- 8 Communicating Results
-- Prerequisites
+- Prerequisites
- 8.1 Introduction
- 8.2 Describing Results through Text
@@ -324,7 +324,7 @@
- 9 National Crime Victimization Survey Vignette
-- Prerequisites
+- Prerequisites
- 9.1 Introduction
- 9.2 Data structure
- 9.3 Survey notation
@@ -345,7 +345,7 @@
- 10 AmericasBarometer Vignette
-- Prerequisites
+- Prerequisites
- 10.1 Introduction
- 10.2 Data Structure
- 10.3 Preparing files
@@ -378,8 +378,8 @@
Chapter 4 Understanding survey data documentation
Prerequisites
For this chapter, here are the libraries and helper functions we will need:
@@ -414,15 +414,15 @@ 4.2.1 Technical documentation4.2.2 Questionnaires
A questionnaire is a series of questions asked to obtain information from survey respondents. A questionnaire gathers opinions, behaviors, or demographic data by employing different types of questions, such as closed-ended (e.g., radio button select one or check all that apply), open-ended (e.g., numeric or text), Likert scales, or ranking questions. It may randomize the display order of responses or include instructions to help respondents understand the questions. A survey may have one questionnaire or multiple, depending on its scale and scope.
The questionnaire is an essential resource for understanding and interpreting the survey data (see Section 2.2.3), and we should use it alongside any analysis. It provides details about each of the questions asked in the survey, such as question name, question wording, response options, skip logic, randomizations, display specification, mode differences, and the universe (if only a subset of respondents were asked the question).
Below in Figure 4.1, we show a question from the ANES 2020 questionnaire (American National Election Studies 2021). This figure shows a particular question’s question name (postvote_rvote), description (Did R Vote?), full wording of the question and responses, response order, universe, question logic (if vote_pre = 0), and other specifications. The section also includes the variable name, which we can link to the codebook.
The content and structure of questionnaires vary depending on the specific survey. For instance, question names may be informative (like the ANES example), sequential, or denoted by a code. In some cases, surveys may not use separate names for questions and variables. Figure 4.2 shows a question from the Behavioral Risk Factor Surveillance System (BRFSS) questionnaire that shows a sequential question number and a coded variable name (as opposed to a question name) (Centers for Disease Control and Prevention (CDC) 2021).
4.2.3 Codebooks
While a questionnaire provides information about the questions asked to respondents, the codebook explains how the survey data was coded and recorded. The codebook lists details such as variable names, variable labels, variable meanings, codes for missing data, value labels, and value types (whether categorical or continuous, etc.). In particular, the codebook often includes information on missing data (as opposed to the questionnaire). The codebook enables us to understand and use the variables appropriately in our analysis.
Figure 4.3 is a question from the ANES 2020 codebook (American National Election Studies 2022). This part indicates a particular variable’s name (V202066), question wording, value labels, universe, and associated survey question (postvote_rvote).
Reviewing both questionnaires and codebooks in parallel is important (Figures 4.1 and 4.3), as questions and variables do not always correspond directly to each other in a one-to-one mapping. A single question may have multiple associated variables, or a single variable may summarize multiple questions. Reviewing the codebook clarifies how to interpret the variables.
4.2.4 Errata
@@ -465,7 +465,7 @@ 4.3 Working with missing data
Missing not at random (MNAR): The missing data is related to unobserved data, and the probability of being missing varies for reasons we are not measuring. For example, respondents with depression may not answer a question about depression severity.
The survey documentation, often the codebook, represents the missing data with a code. For example, a survey may have “Yes” responses coded to 1, “No” responses coded to 2, and missing responses coded to -9. Or, the codebook may list different codes depending on why certain data is missing. In the example of variable V202066 from the ANES (Figure 4.3), -9 represents “Refused,” -7 means that the response was deleted due to an incomplete interview, -6 means that there is no response because there was no follow-up interview, and -1 means “Inapplicable” (due to the designed skip pattern).
When running analysis in R, we must handle missing responses as missing data (i.e., NA) and not numeric data. If missing responses are treated as zeros or arbitrary values, they can artificially alter summary statistics or introduce spurious patterns in the analysis. Recoding these values to NA allows us to handle missing data in different ways in R, such as using functions like na.omit(), complete.cases(), or specialized packages like {tidyimpute} or {mice}. These tools let us treat missing responses as missing data so that we can conduct our analysis accurately and obtain valid results.
Visualizing the missing data can also help to inform the types of missing data that are present. The {naniar} package provides many valuable missing data visualizations, such as using gg_miss_var() to see the count or percent of missing data points by variable or gg_miss_fct() to see relationships in missing data across levels of a factor variable. Investigating the relationships and nature of the missing data before running models can ensure that the missing data is accurately accounted for.
diff --git a/c05-descriptive-analysis.html b/c05-descriptive-analysis.html
index b9b0d520..4546c620 100644
--- a/c05-descriptive-analysis.html
+++ b/c05-descriptive-analysis.html
@@ -378,25 +378,8 @@
Chapter 5 Descriptive analyses in srvyr
Prerequisites
For this chapter, here are the libraries and helper functions we will need:
@@ -699,7 +682,7 @@ Examples
The difference between survey_total() and survey_count() is more evident when specifying continuous variables to sum. Let’s compute the total cost of electricity in whole dollars from the variable DOLLAREL11. We also calculate an unweighted estimate using unweighted(). The unweighted() function calculates unweighted summaries from a tbl_svy object, which reflect the summary among the respondents and do not extrapolate to a population estimate.
recs_des %>%
  summarize(
    elec_bill = survey_total(DOLLAREL),
@@ -776,7 +759,7 @@ Examples
Getting proportions by more than one variable is possible. In the next example, we look at the proportion of housing units by Region and whether air-conditioning is used (ACUsed).12
recs_des %>%
  group_by(Region, ACUsed) %>%
  summarize(p = survey_mean())
@@ -1103,7 +1086,7 @@ Syntax
Examples
We can calculate the correlation between total square footage (TOTSQFT_EN)13 and electricity consumption (BTUEL)14.
recs_des %>%
  summarize(
    SQFT_Elec_Corr = survey_corr(TOTSQFT_EN, BTUEL)
@@ -1598,7 +1581,7 @@ across() Example 2
map example
If you want to calculate something again and again, loops are a common tool. The {purrr} package has the map() functions, which, like a loop, allow you to do the same operation many times. In our case, we want to calculate proportions from the same design multiple times. We find an easy way to do this is to think about how you would do it for one outcome, build a function from there, and then iterate.
Suppose we want to create a table that shows the proportion of people that trust in their government (TrustGovernment)15 as well as those that trust in people (TrustPeople)16.
In the example below, we create a table that has the variable name as a column, the answer as a column, and then the percentage and its standard error.
anes_des %>%
  drop_na(TrustGovernment) %>%
@@ -1689,13 +1672,13 @@ References
RECS has two components: a household survey and an energy supplier survey. For each household that responds, their energy provider(s) are contacted to obtain their energy consumption and expenditure. This value reflects the dollars spent on electricity in 2015 according to the energy supplier. See https://www.eia.gov/consumption/residential/reports/2015/methodology/pdf/2015C&EMethodology.pdf for more details.
Question text: Is any air conditioning equipment used in your home?
Question text: What is the square footage of your home?
BTUEL is derived from the supplier-side component of the survey, where BTUEL represents the electricity consumption in British thermal units (Btus) converted from kilowatt hours (kWh) in a year.
Question: How often can you trust the federal government in Washington to do what is right? (Always, most of the time, about half the time, some of the time, or never / Never, some of the time, about half the time, most of the time, or always)?
Question: Generally speaking, how often can you trust other people? (Always, most of the time, about half the time, some of the time, or never / Never, some of the time, about half the time, most of the time, or always)?
diff --git a/c06-statistical-testing.html b/c06-statistical-testing.html
index f0d6af81..976303ca 100644
--- a/c06-statistical-testing.html
+++ b/c06-statistical-testing.html
@@ -378,8 +378,8 @@
Chapter 6 Statistical testing
Prerequisites
For this chapter, here are the libraries and helper functions we will need:
@@ -675,7 +675,7 @@ 6.3.1 Syntax na.rm = TRUE,
... )
There are six statistics that are accepted in this formula. For tests of homogeneity (when comparing cross-tabulations), the F or Chisq statistics should be used.17 The F statistic is the default and uses the Rao-Scott second-order correction. This correction is designed to assist with complicated sampling designs (i.e., those other than a simple random sample)18. The Chisq statistic is an adjusted version of the Pearson \(\chi^2\) statistic. The version of this statistic in the svychisq() function compares the design effect estimate from the provided survey data to what the \(\chi^2\) distribution would have been if the data came from a simple random sample.
For tests of independence, the Wald and adjWald statistics are recommended as they provide a better adjustment for variable comparisons (Lumley 2010). If the data has a small number of primary sampling units (PSUs) compared to the degrees of freedom, then the adjWald statistic should be used to account for this. The lincom and saddlepoint statistics are available for more complicated data structures.
The formula argument will always be one-sided, unlike the svyttest() function. The two variables of interest should be included with a plus sign: formula = ~ var_1 + var_2. As with the svygofchisq() function, the variables entered into the formula should be formatted as either a factor or a character.
Additionally, as with the t-test function, both svygofchisq() and svychisq() have the na.rm argument. This argument defaults to FALSE; however, unlike the t-test function, if any data is missing, the \(\chi^2\) tests will assume that NA is a category and will include it in the calculation. Throughout this chapter, we will always set na.rm = TRUE, but before analyzing the survey data, review the notes provided in Chapter 4 to better understand how to handle missing data.
@@ -685,7 +685,7 @@ 6.3.2 Examples
Example 1: Goodness of Fit Test
ANES asked respondents about their highest education level. Based on the data from the 2020 American Community Survey (ACS) 5-year estimates19, the education distribution of those 18+ in the U.S. is as follows:
- 11% had less than High School degree
- 27% had a High School degree
- 29% had some college or associate’s degree
@@ -2523,10 +2523,10 @@
References
These two statistics can also be used for goodness of fit tests, if the svygofchisq() function is not used.
http://www.asasrms.org/Proceedings/y2007/Files/JSM2007-000874.pdf
Data was pulled from data.census.gov using the S1501 Education Attainment 2020: ACS 5-Year Estimates Subject Tables
diff --git a/c07-modeling.html b/c07-modeling.html
index 748d8d54..4ec5ed1a 100644
--- a/c07-modeling.html
+++ b/c07-modeling.html
@@ -378,8 +378,8 @@
Chapter 7 Modeling
Prerequisites
For this chapter, here are the libraries and helper functions we will need:
@@ -417,7 +417,7 @@ 7.1 Introduction
\[Y_i=\beta_0+\beta_1 X_i+\epsilon_i\]
would be specified in R as y~x, where the intercept is not explicitly included. To fit a model with no intercept, that is,
\[Y_i=\beta_1 X_i+\epsilon_i\]
it can be specified as y~x-1. Formula notation details in R can be found in the help file for formula20. A quick overview of the common formula notation is in the following table:
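The intercept behavior above can be sketched with base R's lm() on simulated data (the variable names and coefficients here are arbitrary illustrations, not from the book's data):

```r
# Simulated data to illustrate formula notation with and without an intercept
set.seed(1)
x <- rnorm(50)
y <- 2 * x + rnorm(50)

coef(lm(y ~ x))      # estimates an intercept and a slope
coef(lm(y ~ x - 1))  # estimates the slope only (no intercept term)
```

The same `~` formula conventions carry over to the survey-weighted model functions used later in the chapter.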