From 6d9634190d7ce5855f5ff74a24ccb9e3a4d65d8a Mon Sep 17 00:00:00 2001 From: rpowell22 Date: Mon, 7 Aug 2023 18:01:06 -0400 Subject: [PATCH] Add citations for ch2, clean up warnings on build, and update code chunk names to be consistent in each chapter --- 02-overview-surveys.Rmd | 26 +++--- 03-specifying-sample-designs.Rmd | 4 +- ...nderstanding-survey-data-documentation.Rmd | 22 ++--- 05-descriptive-analysis.Rmd | 15 +--- 06-statistical-testing.Rmd | 2 +- 07-modeling.Rmd | 2 +- 08-communicating-results.Rmd | 2 +- 09-ncvs-vignette.Rmd | 2 +- 10-ambarom-vignette.Rmd | 2 +- book.bib | 80 +++++++++++++++++-- css/style.css | 2 +- 11 files changed, 106 insertions(+), 53 deletions(-) diff --git a/02-overview-surveys.Rmd b/02-overview-surveys.Rmd index 58697d17..8ad8a359 100644 --- a/02-overview-surveys.Rmd +++ b/02-overview-surveys.Rmd @@ -1,20 +1,18 @@ # Overview of Surveys {#c02-overview-surveys} - - Developing surveys to gather accurate information about populations involves a more intricate and time-intensive process compared to surveys that use non-random criteria for selecting samples. Researchers can spend months, or even years, developing the study design, questions, and other methods for a single survey to ensure high-quality data is collected. While this book focuses on the analysis methods of complex surveys, understanding the entire survey life cycle can provide a better insight into what types of analyses should be conducted on the data. The *survey life cycle* consists of the stages required to successfully execute a survey project. Each stage influences the timing, costs, and feasibility of the survey, consequently impacting the data collected and how it should be analyzed. The survey life cycle starts with a *research topic or question of interest* (e.g., what impact does childhood trauma have on health outcomes later in life). 
Researchers typically review existing data sources to determine if data are already available that can answer this question, as drawing from available resources can result in a reduced burden on respondents, cheaper research costs, and faster research outcomes. However, if existing data cannot answer the nuances of the research question, a survey can be used to capture the exact data that the researcher needs. -To gain a deeper understanding of survey design and implementation, there are many pieces of existing literature that we recommend reviewing in detail (e.g., @dillman2014mode, @groves2009survey, @Tourangeau2000psych, @Bradburn2004, @valliant2013practical, and @biemer2003survqual). +To gain a deeper understanding of survey design and implementation, there are many pieces of existing literature that we recommend reviewing in detail [e.g., @dillman2014mode; @groves2009survey; @Tourangeau2000psych; @Bradburn2004; @valliant2013practical; @biemer2003survqual]. ## Pre-Survey Planning {#pre-survey-planning} When starting a survey, there are multiple things to consider. *Errors* are the differences between the true values of the variables being studied and the values obtained through the survey. Each step and decision made before the launch of the survey can impact the types of error that are introduced into the data, which in turn impact how to interpret the results. 
-Generally, survey researchers consider there to be seven main sources of error that fall into two major categories of Representation and Measurement (@groves2009survey): +Generally, survey researchers consider there to be seven main sources of error that fall into two major categories of Representation and Measurement [@groves2009survey]: - Representation @@ -27,7 +25,7 @@ Generally, survey researchers consider there to be seven main sources of error t - **Measurement Error**: A mismatch between what the researcher asked and how the respondent answered - **Processing Error**: Edits by the researcher to responses provided by the respondent (e.g., adjustments to data based on illogical responses) -Almost every survey will have some errors. Researchers attempt to conduct a survey that reduces the *total survey error*, or the accumulation of all errors that may arise throughout the survey life cycle. By assessing these different types of errors together, researchers can seek strategies to maximize the overall survey quality and improve the reliability and validity of results {@tse-doc}. However, attempts to lower individual sources errors (and therefore total survey error) come at the price of time, resources, and money: +Almost every survey will have some errors. Researchers attempt to conduct a survey that reduces the *total survey error*, or the accumulation of all errors that may arise throughout the survey life cycle. By assessing these different types of errors together, researchers can seek strategies to maximize the overall survey quality and improve the reliability and validity of results [@tse-doc]. However, attempts to lower individual sources of error (and therefore total survey error) come at the price of time, resources, and money: - **Sampling Error Tradeoff**: Researchers can increase the sample size to reduce sampling error; however, larger samples can be expensive and time-consuming to find. 
- **Coverage Error Tradeoff**: Researchers can search for more accurate and updated sampling frames, but they can be difficult to construct or obtain. @@ -53,7 +51,7 @@ Once the researchers have selected the sampling frame, the next step is determin #### Example: Number of Pets in a Household {.unnumbered #overview-design-sampdesign-ex} -Let's use a simple example where a researcher is interested in the average number of pets in a household. Our researcher will need to consider the target population for this study. Specifically, are they interested in all households in a given country or household in a more local area (e.g., city or state)? Let's assume our researcher is interested in the number of pets in a U.S. household with at least one adult (18 years old or older). In this case, using a sampling frame of mailing addresses would provide the least coverage error as the frame would closely match our target population. Specifically, our researcher would most likely want to use the Computerized Delivery Sequence File (CDSF), which is a file of mailing addresses that the United States Postal Service (USPS) creates and covers nearly 100% of U.S. households (@harter2016address). To sample these households, for simplicity, we will use a stratified simple random sample design, where we randomly sample households within each state (i.e., we stratify by state). +Let's use a simple example where a researcher is interested in the average number of pets in a household. Our researcher will need to consider the target population for this study. Specifically, are they interested in all households in a given country or households in a more local area (e.g., city or state)? Let's assume our researcher is interested in the number of pets in a U.S. household with at least one adult (18 years old or older). In this case, using a sampling frame of mailing addresses would provide the least coverage error as the frame would closely match our target population. 
Specifically, our researcher would most likely want to use the Computerized Delivery Sequence File (CDSF), which is a file of mailing addresses that the United States Postal Service (USPS) creates and covers nearly 100% of U.S. households [@harter2016address]. To sample these households, for simplicity, we will use a stratified simple random sample design, where we randomly sample households within each state (i.e., we stratify by state). Throughout this chapter, we will build on this example research question to plan a survey. @@ -66,13 +64,13 @@ With the sampling design decided, researchers can then decide on how to survey t - Computer Assisted Web Interview (CAWI; also known as web or on-line interviewing) - Paper and Pencil Interview (PAPI) -Researchers can use a single mode to collect data or multiple modes (also called *mixed modes*). Using mixed modes can allow for broader reach and increase response rates depending on the target population (@deLeeuw2005, @DeLeeuw_2018, @biemer_choiceplus). For example, researchers could both call households to conduct a CATI survey and send mail with a PAPI survey to the household. Using both of these modes, researchers could gain participation through the mail from individuals who do not pick up the phone to unknown numbers or through the phone from individuals who do not open all of their mail. However, mode effects (where responses differ based on the mode of response) can be present in the data and may need to be considered during analysis. +Researchers can use a single mode to collect data or multiple modes (also called *mixed modes*). Using mixed modes can allow for broader reach and increase response rates depending on the target population [@deLeeuw2005; @DeLeeuw_2018; @biemer_choiceplus]. For example, researchers could both call households to conduct a CATI survey and send mail with a PAPI survey to the household. 
Using both of these modes, researchers could gain participation through the mail from individuals who do not pick up the phone to unknown numbers or through the phone from individuals who do not open all of their mail. However, mode effects (where responses differ based on the mode of response) can be present in the data and may need to be considered during analysis. When selecting which mode, or modes, to use, understanding the unique aspects of the chosen target population and sampling frame will provide insight into how they can best be reached and engaged. For example, if we plan to survey adults aged 18-24 who live in North Carolina, asking them to complete a survey using CATI (i.e., over the phone) would most likely not be as successful as other modes like the web. This age group does not talk on the phone as much as other generations, and often do not answer their phones for unknown numbers. Additionally, the mode for contacting respondents relies on what information is available on the sampling frame. For example, if our sampling frame includes an email address, we could email our selected sample members to convince them to complete a survey. Or if the sampling frame is a list of mailing addresses, researchers would have to contact sample members with a letter. It is important to note that there can be a difference between the contact and survey modes. For example, if we have a sampling frame with addresses, we can send a letter to our sample members and provide information on how to complete a web survey. Or we could use mixed-mode surveys and send sample members a paper and pencil survey with our letter and also ask them to complete the survey online. Combining different contact modes and different survey modes can be useful in reducing *unit nonresponse error*--where the entire unit (e.g., a household) does not respond to the survey at all--as different sample members may respond better to different contact and survey modes. 
However, when considering which modes to use, it is important to make access to the survey as easy as possible for sample members to reduce burden and unit nonresponse. -Another way to reduce unit nonresponse error is through varying the language of the contact materials (@dillman2014mode). People are motivated by different things, so constantly repeating the same message may not be helpful. Instead, mixing up the messaging and the type of contact material the sample member receives can increase response rates and reduce the unit nonresponse error. For example, instead of only sending standard letters, researchers could consider sending mailings that invoke "urgent" or "important" thoughts by sending priority letters or using other delivery services like FedEx, UPS, or DHL. +Another way to reduce unit nonresponse error is through varying the language of the contact materials [@dillman2014mode]. People are motivated by different things, so constantly repeating the same message may not be helpful. Instead, mixing up the messaging and the type of contact material the sample member receives can increase response rates and reduce the unit nonresponse error. For example, instead of only sending standard letters, researchers could consider sending mailings that invoke "urgent" or "important" thoughts by sending priority letters or using other delivery services like FedEx, UPS, or DHL. A study timeline may also determine the number and types of contacts. If the timeline is long, then there is a lot of time for follow-ups and varying the message in contact materials. If the timeline is short, then fewer follow-ups can be implemented. Many studies will start with the tailored design method put forth by @dillman2014mode and implement 5 contacts: @@ -112,7 +110,7 @@ Researchers can benefit from the work of others by using questions from other su If a question does not exist in a question bank, researchers can craft their own. 
When creating their own questions, researchers should start with the research question or topic and attempt to write questions that match the concept. The closer the question asked is to the overall concept, the better *validity* there is. For example, if the researcher wants to know how people consume TV series and movies but only asks a question about how many TVs are in the house, then they would be missing other ways that people watch TV series and movies, such as on other devices or at places outside of the home. As mentioned above, researchers can employ techniques to increase the validity of their questionnaire. For example, questionnaire testing involves conducting a pilot of the survey instrument to identify and fix potential issues before the main survey is conducted. Cognitive interviewing is a technique where researchers walk through the survey with participants, encouraging them to speak their thoughts out loud to uncover how they interpret and understand survey questions. -Additionally, when designing questions, researchers should consider the mode for the survey and adjust language appropriately. In self-administered surveys (e.g., web or mail), respondents can see all the questions and response options, but that is not the case in interviewer-administered surveys (e.g., CATI or CAPI). With interviewer-administered surveys, the response options need to be read aloud to the respondents, so the question may need to be adjusted to allow a better flow to the interview. Additionally, with self-administered surveys, because the respondents are viewing the questionnaire, the formatting of the questions is even more important to ensure accurate measurement. Incorrect formatting or wording can result in *measurement error*, so following best practices or using existing validated questions can reduce error. 
There are multiple resources to help researchers draft questions for different modes (e.g., @dillman2014mode, @Fowler1989, @Bradburn2004, @Tourangeau2004spacing). +Additionally, when designing questions, researchers should consider the mode for the survey and adjust language appropriately. In self-administered surveys (e.g., web or mail), respondents can see all the questions and response options, but that is not the case in interviewer-administered surveys (e.g., CATI or CAPI). With interviewer-administered surveys, the response options need to be read aloud to the respondents, so the question may need to be adjusted to allow a better flow to the interview. Additionally, with self-administered surveys, because the respondents are viewing the questionnaire, the formatting of the questions is even more important to ensure accurate measurement. Incorrect formatting or wording can result in *measurement error*, so following best practices or using existing validated questions can reduce error. There are multiple resources to help researchers draft questions for different modes [e.g., @dillman2014mode; @Fowler1989; @Bradburn2004; @Tourangeau2004spacing]. #### Example: Number of Pets in a Household {.unnumbered #overview-design-questionnaire-ex} @@ -170,11 +168,11 @@ To avoid this issue, researchers should consider these possibilities and adjust Researchers can then code the responses from the open-ended box and get a better understanding of the respondent's choice of preferred pet. Interpreting this question becomes easier as researchers no longer need to qualify the results with the choices provided. -This is a very simple example of how the presentation of the question and options can impact the findings. More complex topics and questions will need researchers to thoroughly consider how to mitigate any impacts from the presentation, formatting, wording, and other aspects. 
As survey analysts, reviewing not only the data but also the wording of the questions is crucial to ensure the results are presented in a manner consistent with the question asked. Chapter @\ref(c04-understanding-survey-data-documentation) provides further details on how to review existing survey documentation to inform our analyses. +This is a very simple example of how the presentation of the question and options can impact the findings. More complex topics and questions will need researchers to thoroughly consider how to mitigate any impacts from the presentation, formatting, wording, and other aspects. As survey analysts, reviewing not only the data but also the wording of the questions is crucial to ensure the results are presented in a manner consistent with the question asked. Chapter \@ref(c04-understanding-survey-data-documentation) provides further details on how to review existing survey documentation to inform our analyses. ## Data Collection {#overview-datacollection} -Once the data collection starts, researchers try to stick to the data collection protocol designed during pre-survey planning. However, a good researcher will adjust their plans and adapt as needed to the current progress of data collection (@Schouten2018). Some extreme examples could be natural disasters that could prevent mail or interviewers from getting to the sample members. Others could be smaller in that something newsworthy occurs that is connected to the survey, so researchers could choose to play this up in communication materials. In addition to these external factors, there could be factors unique to the survey, such as lower response rates for a specific sub-group, so the data collection protocol may need to find ways to improve response rates for that specific group. +Once the data collection starts, researchers try to stick to the data collection protocol designed during pre-survey planning. 
However, a good researcher will adjust their plans and adapt as needed to the current progress of data collection [@Schouten2018]. Some extreme examples could be natural disasters that could prevent mail or interviewers from getting to the sample members. Others could be smaller in that something newsworthy occurs that is connected to the survey, so researchers could choose to play this up in communication materials. In addition to these external factors, there could be factors unique to the survey, such as lower response rates for a specific sub-group, so the data collection protocol may need to find ways to improve response rates for that specific group. @@ -196,7 +194,7 @@ Let's return to the question we created to ask about [animal preference](#overvi ### Weighting {#overview-post-weighting} -Weighting can typically be used to address some of the error sources identified in the previous sections. For example, weights may be used to address coverage, sampling, and nonresponse errors. Many published surveys will include an "analysis weight" variable that combines these adjustments. However, weighting itself can also introduce *adjustment error*, so researchers need to balance which types of errors should be corrected with weighting. The construction of weights is outside the scope of this book, and researchers should reference other materials if interested in constructing their own (@Valliant2018weights). Instead, this book assumes the survey has been completed, weights are constructed, and data is made available for users. We will walk users through how to read the documentation (Chapter \@ref(c04-understanding-survey-data-documentation)) and work with the data and analysis weights provided to analyze and interpret survey results correctly. +Weighting can typically be used to address some of the error sources identified in the previous sections. For example, weights may be used to address coverage, sampling, and nonresponse errors. 
Many published surveys will include an "analysis weight" variable that combines these adjustments. However, weighting itself can also introduce *adjustment error*, so researchers need to balance which types of errors should be corrected with weighting. The construction of weights is outside the scope of this book, and researchers should reference other materials if interested in constructing their own [@Valliant2018weights]. Instead, this book assumes the survey has been completed, weights are constructed, and data is made available for users. We will walk users through how to read the documentation (Chapter \@ref(c04-understanding-survey-data-documentation)) and work with the data and analysis weights provided to analyze and interpret survey results correctly. #### Example: Number of Pets in a Household {.unnumbered #overview-post-weighting-ex} @@ -205,13 +203,13 @@ In the simple example of our survey, we decided to use a stratified sample by st ### Disclosure {#overview-post-disclosure} Before data is made publicly available, researchers will need to ensure that individual respondents can not be identified by the data when confidentiality is required. There are a variety of different methods that can be used, including *data swapping*, *top or bottom coding*, *coarsening*, and *perturbation.* In data swapping, researchers may swap specific data values across different respondents so that it does not impact insights from the data but ensures that specific individuals cannot be identified. For extreme values, top and bottom coding is sometimes used. For example, researchers may top-code income values such that households with income greater than \$99,999,999 are coded into a single category of \$99,999,999 or more. Other disclosure methods may include aggregating response categories or location information to avoid having only a few respondents in a given group and thus be identified. 
For example, researchers may use coarsening to display income in categories instead of as a continuous variable. Data producers may also perturb the data by adding random noise. There is as much art as there is a science to the methods used for disclosure, and in documentation, researchers should only provide high-level comments that disclosure was conducted and not specific details to ensure nobody can reverse the disclosure and thus identify individuals. For more information on different disclosure methods, please see @Skinner2009 and -AAPOR Standards^[https://www-archive.aapor.org/Standards-Ethics/AAPOR-Code-of-Ethics/Survey-Disclosure-Checklist.aspx]. +[AAPOR Standards](https://www-archive.aapor.org/Standards-Ethics/AAPOR-Code-of-Ethics/Survey-Disclosure-Checklist.aspx). ### Documentation {#overview-post-documentation} Documentation is a critical step of the survey life cycle. Researchers systematically record all the details, decisions, procedures, and methodologies to ensure transparency, reproducibility, and the overall quality of survey research. -Proper documentation allows analysts to understand, reproduce, and evaluate the study's methods and findings. Chapter @\ref(c04-understanding-survey-data-documentation) dives into how analysts should use survey data documentation. +Proper documentation allows analysts to understand, reproduce, and evaluate the study's methods and findings. Chapter \@ref(c04-understanding-survey-data-documentation) dives into how analysts should use survey data documentation. 
## Post-survey data analysis and reporting diff --git a/03-specifying-sample-designs.Rmd b/03-specifying-sample-designs.Rmd index ee859859..fa4b9e75 100644 --- a/03-specifying-sample-designs.Rmd +++ b/03-specifying-sample-designs.Rmd @@ -1,7 +1,7 @@ # Specifying sample designs and replicate weights in {srvyr} {#c03-specifying-sample-designs} ::: {.prereqbox-header} -`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq}'` +`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq3}'` ::: ::: {.prereqbox data-latex="{Prerequisites}"} @@ -21,7 +21,7 @@ source("helper-fun/helper-functions.R") To help explain the different types of sample designs, this chapter will use the `api` and `scd` data that comes in the {survey} package: ```{r} -#| label: ch3-setup-surveydata +#| label: samp-setup-surveydata data(api) data(scd) ``` diff --git a/04-understanding-survey-data-documentation.Rmd b/04-understanding-survey-data-documentation.Rmd index c277c305..608df312 100644 --- a/04-understanding-survey-data-documentation.Rmd +++ b/04-understanding-survey-data-documentation.Rmd @@ -1,7 +1,7 @@ # Understanding survey data documentation {#c04-understanding-survey-data-documentation} ::: {.prereqbox-header} -`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq}'` +`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq4}'` ::: ::: {.prereqbox data-latex="{Prerequisites}"} @@ -50,10 +50,10 @@ A questionnaire is a series of questions asked to obtain information from survey The questionnaire is an essential resource for understanding and interpreting the survey data (see Section \@ref(overview-design-questionnaire)), and we should use it alongside any analysis. It provides details about each of the questions asked in the survey, such as question name, question wording, response options, skip logic, randomizations, display specification, mode differences, and the universe (if only a subset of respondents were asked the question). 
-Below in Figure \@ref(fig:que-examp), we show a question from the ANES 2020 questionnaire [@anes-svy]. This figure shows a particular question's question name (`postvote_rvote`), description (Did R Vote?), full wording of the question and responses, response order, universe, question logic (if `vote_pre` = 0), and other specifications. The section also includes the variable name, which we can link to the codebook. +Below in Figure \@ref(fig:understand-que-examp), we show a question from the ANES 2020 questionnaire [@anes-svy]. This figure shows a particular question's question name (`postvote_rvote`), description (Did R Vote?), full wording of the question and responses, response order, universe, question logic (if `vote_pre` = 0), and other specifications. The section also includes the variable name, which we can link to the codebook. ```{r} -#| label: que-examp +#| label: understand-que-examp #| echo: false #| fig.cap: ANES 2020 Questionnaire Example #| fig.alt: Question information about the variable postvote_rvote from ANES 2020 questionnaire Survey question, Universe, Logic, Web Spec, Response Order, and Released Variable are included. @@ -61,10 +61,10 @@ Below in Figure \@ref(fig:que-examp), we show a question from the ANES 2020 ques knitr::include_graphics(path="images/questionnaire-example.jpg") ``` -The content and structure of questionnaires vary depending on the specific survey. For instance, question names may be informative (like the ANES example), sequential, or denoted by a code. In some cases, surveys may not use separate names for questions and variables. Figure \@ref(fig:que-examp-2) shows a question from the Behavioral Risk Factor Surveillance System (BRFSS) questionnaire that shows a sequential question number and a coded variable name (as opposed to a question name) [@brfss-svy]. +The content and structure of questionnaires vary depending on the specific survey. 
For instance, question names may be informative (like the ANES example), sequential, or denoted by a code. In some cases, surveys may not use separate names for questions and variables. Figure \@ref(fig:understand-que-examp-2) shows a question from the Behavioral Risk Factor Surveillance System (BRFSS) questionnaire that shows a sequential question number and a coded variable name (as opposed to a question name) [@brfss-svy]. ```{r} -#| label: que-examp-2 +#| label: understand-que-examp-2 #| echo: false #| fig.cap: BRFSS 2021 Questionnaire Example #| fig.alt: Question information about the variable BPHIGH6 from BRFSS 2021 questionnaire. Question number, question text, variable names, responses, skip info and CATI note, interviewer notes, and columns are included. @@ -78,10 +78,10 @@ Given the variety in how the survey information is presented in documentation, i While a questionnaire provides information about the questions asked to respondents, the codebook explains how the survey data was coded and recorded. The codebook lists details such as variable names, variable labels, variable meanings, codes for missing data, values labels, and value types (whether categorical or continuous, etc.). In particular, the codebook often includes information on missing data (as opposed to the questionnaire). The codebook enables us to understand and use the variables appropriately in our analysis. -Figure \@ref(fig:codebook-examp) is a question from the ANES 2020 codebook [@anes-cb]. This part indicates a particular variable's name (`V202066`), question wording, value labels, universe, and associated survey question (`postvote_rvote`). +Figure \@ref(fig:understand-codebook-examp) is a question from the ANES 2020 codebook [@anes-cb]. This part indicates a particular variable's name (`V202066`), question wording, value labels, universe, and associated survey question (`postvote_rvote`). 
```{r} -#| label: codebook-examp +#| label: understand-codebook-examp #| echo: false #| fig.cap: ANES 2020 Codebook Example #| fig.alt: Variable information about the variable V202066 from ANES 2020 questionnaire Variable meaning, Value labels, Universe, and Survey Question(s) are included. @@ -89,7 +89,7 @@ Figure \@ref(fig:codebook-examp) is a question from the ANES 2020 codebook [@ane knitr::include_graphics(path="images/codebook-example.jpg") ``` -Reviewing both questionnaires and codebooks in parallel is important (Figures \@ref(fig:que-examp) and \@ref(fig:codebook-examp), as questions and variables do not always correspond directly to each other in a one-to-one mapping. A single question may have multiple associated variables, or a single variable may summarize multiple questions. Reviewing the codebook clarifies how to interpret the variables. +Reviewing both questionnaires and codebooks in parallel is important (Figures \@ref(fig:understand-que-examp) and \@ref(fig:understand-codebook-examp)), as questions and variables do not always correspond directly to each other in a one-to-one mapping. A single question may have multiple associated variables, or a single variable may summarize multiple questions. Reviewing the codebook clarifies how to interpret the variables. ### Errata @@ -117,7 +117,7 @@ Missing data can be a significant problem in survey analysis, as it can introduc c. **Missing not at random (MNAR)**: The missing data is related to unobserved data, and the probability of being missing varies for reasons we are not measuring. For example, if respondents with depression do not answer a question about depression severity. -The survey documentation, often the codebook, represents the missing data with a code. For example, a survey may have "Yes" responses coded to `1`, "No" responses coded to `2`, and missing responses coded to `-9`. Or, the codebook may list different codes depending on why certain data is missing. 
In the example of variable `V202066` from the ANES (Figure \@ref(fig:codebook-examp)), `-9` represents "Refused," `-7` means that the response was deleted due to an incomplete interview, `-6` means that there is no response because there was no follow-up interview, and `-1` means "Inapplicable" (due to the designed skip pattern). +The survey documentation, often the codebook, represents the missing data with a code. For example, a survey may have "Yes" responses coded to `1`, "No" responses coded to `2`, and missing responses coded to `-9`. Or, the codebook may list different codes depending on why certain data is missing. In the example of variable `V202066` from the ANES (Figure \@ref(fig:understand-codebook-examp)), `-9` represents "Refused," `-7` means that the response was deleted due to an incomplete interview, `-6` means that there is no response because there was no follow-up interview, and `-1` means "Inapplicable" (due to the designed skip pattern). When running analysis in R, we must handle missing responses as missing data (i.e., `NA`) and not numeric data. If missing responses are treated as zeros or arbitrary values, they can artificially alter summary statistics or introduce spurious patterns in the analysis. Recoding these values to `NA` will allow you to handle missing data in different ways in R, such as using functions like `na.omit()`, `complete.cases()`, or specialized packages like {tidyimpute} or {mice}. These tools allow us to treat missing responses as missing data to conduct your analysis accurately and obtain valid results. @@ -136,7 +136,7 @@ Dealing with missing data due to skip patterns requires careful consideration. When dealing with missing data that is MCAR, MAR, or MNAR, we must consider the implications of how we handle these missing data and avoid introducing more sources of bias. 
For instance, we can analyze only the respondents who answered all questions by performing listwise deletion, which drops all rows from a data frame with a missing value in any column. We can use the function `tidyr::drop_na()` for listwise deletion. For example, let's say we have a dataset `dat` that has one complete case and 2 cases with some missing data. ```{r} -#| label: drop-na-example1 +#| label: understand-dropna-example1 dat <- tibble::tribble(~ col1, ~ col2, ~ col3, "a", "d", "e", "b", NA, NA, @@ -147,7 +147,7 @@ dat If we use the `tidyr::drop_na()` funtion, only the first case will remain as the other two cases have at least one missing value. ```{r} -#| label: drop-na-example2 +#| label: understand-dropna-example2 dat %>% tidyr::drop_na() ``` diff --git a/05-descriptive-analysis.Rmd b/05-descriptive-analysis.Rmd index 35986e8a..921d3328 100644 --- a/05-descriptive-analysis.Rmd +++ b/05-descriptive-analysis.Rmd @@ -1,20 +1,7 @@ # Descriptive analyses in srvyr {#c05-descriptive-analysis} - -```{r} -#| label: desc-summary-tab -#| echo: FALSE -tribble( - ~c1, ~c2, - "**Topic**", "Descriptive analysis of survey data", - "**Purpose**", "purpose-blah", - "**Learning Goals**", "learning-goals-blah" -) %>% - knitr::kable(format="pandoc", col.names=NULL, caption="Summary of Chapter 5") -``` - ::: {.prereqbox-header} -`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq}'` +`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq5}'` ::: ::: {.prereqbox data-latex="{Prerequisites}"} diff --git a/06-statistical-testing.Rmd b/06-statistical-testing.Rmd index 7bc4e6c3..c86f9643 100644 --- a/06-statistical-testing.Rmd +++ b/06-statistical-testing.Rmd @@ -1,7 +1,7 @@ # Statistical testing {#c06-statistical-testing} ::: {.prereqbox-header} -`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq}'` +`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq6}'` ::: ::: {.prereqbox data-latex="{Prerequisites}"} diff --git a/07-modeling.Rmd 
b/07-modeling.Rmd index 95c5d103..426eb7f7 100644 --- a/07-modeling.Rmd +++ b/07-modeling.Rmd @@ -1,7 +1,7 @@ # Modeling {#c07-modeling} ::: {.prereqbox-header} -`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq}'` +`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq7}'` ::: ::: {.prereqbox data-latex="{Prerequisites}"} diff --git a/08-communicating-results.Rmd b/08-communicating-results.Rmd index 6a49c8c7..b1b3b8be 100644 --- a/08-communicating-results.Rmd +++ b/08-communicating-results.Rmd @@ -1,7 +1,7 @@ # Communicating Results {#c08-communicating-results} ::: {.prereqbox-header} -`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq}'` +`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq8}'` ::: ::: {.prereqbox data-latex="{Prerequisites}"} diff --git a/09-ncvs-vignette.Rmd b/09-ncvs-vignette.Rmd index 59ec293a..050af608 100644 --- a/09-ncvs-vignette.Rmd +++ b/09-ncvs-vignette.Rmd @@ -1,7 +1,7 @@ # National Crime Victimization Survey Vignette {#c09-ncvs-vignette} ::: {.prereqbox-header} -`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq}'` +`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq9}'` ::: ::: {.prereqbox data-latex="{Prerequisites}"} diff --git a/10-ambarom-vignette.Rmd b/10-ambarom-vignette.Rmd index bdd5a6a9..884d74eb 100644 --- a/10-ambarom-vignette.Rmd +++ b/10-ambarom-vignette.Rmd @@ -1,7 +1,7 @@ # AmericasBarometer Vignette {#c10-ambarom-vignette} ::: {.prereqbox-header} -`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq}'` +`r if (knitr:::is_html_output()) '### Prerequisites {- #prereq10}'` ::: ::: {.prereqbox data-latex="{Prerequisites}"} diff --git a/book.bib b/book.bib index bd5053c0..9ffcac82 100644 --- a/book.bib +++ b/book.bib @@ -244,14 +244,13 @@ @misc{acs-5yr-doc } @article{tse-doc, author = {Biemer, Paul P.}, - title = "{Total Survey Error: Design, Implementation, and Evaluation}", + title = {Total Survey Error: Design, Implementation, and Evaluation}, 
journal = {Public Opinion Quarterly}, volume = {74}, number = {5}, pages = {817-848}, year = {2010}, month = {01}, - abstract = "{The total survey error (TSE) paradigm provides a theoretical framework for optimizing surveys by maximizing data quality within budgetary constraints. In this article, the TSE paradigm is viewed as part of a much larger design strategy that seeks to optimize surveys by maximizing total survey quality; i.e., quality more broadly defined to include user-specified dimensions of quality. Survey methodology, viewed within this larger framework, alters our perspectives on the survey design, implementation, and evaluation. As an example, although a major objective of survey design is to maximize accuracy subject to costs and timeliness constraints, the survey budget must also accommodate additional objectives related to relevance, accessibility, interpretability, comparability, coherence, and completeness that are critical to a survey's “fitness for use.” The article considers how the total survey quality approach can be extended beyond survey design to include survey implementation and evaluation. In doing so, the “fitness for use” perspective is shown to influence decisions regarding how to reduce survey error during design implementation and what sources of error should be evaluated in order to assess the survey quality today and to prepare for the surveys of the future.}", issn = {0033-362X}, doi = {10.1093/poq/nfq058}, url = {https://doi.org/10.1093/poq/nfq058}, @@ -271,7 +270,7 @@ @book{groves2009survey } @book{biemer2003survqual, title = {Introduction to survey quality}, - author = {Biemer, Paul P and Lyberg, Lars E}, + author = {Biemer, Paul P. 
and Lyberg, Lars E.}, year = 2003, publisher = {John Wiley \& Sons} } @@ -297,15 +296,84 @@ @article{DeLeeuw_2018 } @article{biemer_choiceplus, title = {{Using Bonus Monetary Incentives to Encourage Web Response in Mixed-Mode Household Surveys}}, - author = {Biemer, Paul P and Murphy, Joe and Zimmer, Stephanie and Berry, Chip and Deng, Grace and Lewis, Katie}, + author = {Biemer, Paul P. and Murphy, Joe and Zimmer, Stephanie and Berry, Chip and Deng, Grace and Lewis, Katie}, year = 2017, month = {06}, journal = {Journal of Survey Statistics and Methodology}, volume = 6, number = 2, - pages = {240--261}, + pages = {240-261}, doi = {10.1093/jssam/smx015}, issn = {2325-0984}, url = {https://doi.org/10.1093/jssam/smx015}, eprint = {https://academic.oup.com/jssam/article-pdf/6/2/240/24807375/smx015.pdf} -} \ No newline at end of file +} +@book{Bradburn2004, + author = {Norman M. Bradburn and Seymour Sudman and Brian Wansink}, + edition = {2nd Edition}, + publisher = {Jossey-Bass}, + title = {Asking Questions: The Definitive Guide to Questionnaire Design}, + year = {2004}, +} +@book{Fowler1989, + author = {Floyd J Fowler and Thomas W. Mangione}, + publisher = {SAGE}, + title = {Standardized Survey Interviewing}, + year = {1989}, +} +@book{Kim2021, + author = {Jae Kwang Kim and Jun Shao}, + publisher = {Chapman \& Hall/CRC Press}, + title = {Statistical Methods for Handling Incomplete Data}, + year = {2021}, +} +@book{Schouten2018, + author = {Barry Schouten and Andy Peytchev and James Wagner}, + publisher = {Chapman \& Hall/CRC Press}, + title = {Adaptive Survey Design}, + year = {2018}, +} +@book{Tourangeau2000psych, + author = {Roger Tourangeau and Lance J. Rips and Kenneth Rasinski}, + publisher = {Cambridge University Press}, + title = {Psychology of Survey Response}, + year = {2000}, +} +@article{Tourangeau2004spacing, + author = {Roger Tourangeau and Mick P. 
Couper and Frederick Conrad},
+  isbn = {0033-362X},
+  issn = {0033362X},
+  number = {3},
+  journal = {Public Opinion Quarterly},
+  pages = {368-393},
+  publisher = {Oxford University Press},
+  title = {Spacing, Position, and Order: Interpretive Heuristics for Visual Features of Survey Questions},
+  volume = {68},
+  url = {http://www.jstor.org/stable/3521676},
+  year = {2004},
+}
+@book{Valliant2018weights,
+  author = {Richard Valliant and Jill A. Dever},
+  publisher = {Stata Press},
+  title = {Survey Weights: A Step-by-step Guide to Calculation},
+  year = {2018},
+}
+@article{deLeeuw2005,
+  author = {DeLeeuw, Edith D.},
+  number = {2},
+  journal = {Journal of Official Statistics},
+  pages = {233-255},
+  title = {To Mix or Not to Mix Data Collection Modes in Surveys},
+  volume = {21},
+  year = {2005},
+}
+
+@inbook{Skinner2009,
+  author = {Chris Skinner},
+  editor = {C.R. Rao},
+  title = {Chapter 15: Statistical Disclosure Control for Survey Data},
+  booktitle = {Handbook of Statistics: Sample Surveys: Design, Methods and Applications},
+  pages = {381-396},
+  publisher = {Elsevier B.V.},
+  year = {2009},
+}
diff --git a/css/style.css b/css/style.css
index 977db557..3320a86a 100644
--- a/css/style.css
+++ b/css/style.css
@@ -55,7 +55,7 @@ li.ro::marker{
   border-top-right-radius: 10px;
 }
 
-h3.hasAnchor#prereq {
+h3.hasAnchor#prereq3, h3.hasAnchor#prereq4, h3.hasAnchor#prereq5, h3.hasAnchor#prereq6, h3.hasAnchor#prereq7, h3.hasAnchor#prereq8, h3.hasAnchor#prereq9, h3.hasAnchor#prereq10 {
   margin-top: 0em !important;
   margin-bottom: 0em !important;
 }