diff --git a/DESCRIPTION b/DESCRIPTION index b8a6c3134..720c5afe1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.21 +Version: 0.13.0.22 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index a693a4722..57d2ddfe5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,19 +16,29 @@ BREAKING CHANGES AND DEPRECATIONS - if `select` (previously `pattern`) is a named vector, then all elements must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors. +* Order of arguments `by` and `probability_weights` in `rescale_weights()` has + changed, because for `method = "kish"`, the `by` argument is optional. + +* The name of the rescaled weights variables in `rescale_weights()` have been + renamed. `pweights_a` and `pweights_b` are now named `rescaled_weights_a` + and `rescaled_weights_b`. + * `print()` methods for `data_tabulate()` with multiple sub-tables (i.e. when length of `by` was > 1) were revised. Now, an integrated table instead of multiple tables is returned. Furthermore, `print_html()` did not work, which was also fixed now. * `demean()` (and `degroup()`) gets an `append` argument that defaults to `TRUE`, - to append the centered variabled to the original data frame, instead of + to append the centered variables to the original data frame, instead of returning the de- and group-meaned variables only. Use `append = FALSE` to for the previous default behaviour (i.e. only returning the newly created variables). CHANGES +* `rescale_weights()` gets a `method` argument, to choose method to rescale + weights. Options are `"carle"` (the default) and `"kish"`. + * The `select` argument, which is available in different functions to select variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). diff --git a/R/data_arrange.R b/R/data_arrange.R index 0cff97bd8..07a7f5f56 100644 --- a/R/data_arrange.R +++ b/R/data_arrange.R @@ -42,7 +42,7 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) { select <- gsub("^-", "", select) # check for variables that are not in data - dont_exist <- select[which(!select %in% names(data))] + dont_exist <- setdiff(select, colnames(data)) if (length(dont_exist) > 0) { if (safe) { diff --git a/R/rescale_weights.R b/R/rescale_weights.R index ec0c75616..35e1bba98 100644 --- a/R/rescale_weights.R +++ b/R/rescale_weights.R @@ -2,96 +2,186 @@ #' @name rescale_weights #' #' @description Most functions to fit multilevel and mixed effects models only -#' allow to specify frequency weights, but not design (i.e. sampling or -#' probability) weights, which should be used when analyzing complex samples -#' and survey data. `rescale_weights()` implements an algorithm proposed -#' by \cite{Asparouhov (2006)} and \cite{Carle (2009)} to rescale design -#' weights in survey data to account for the grouping structure of multilevel -#' models, which then can be used for multilevel modelling. -#' +#' allow the user to specify frequency weights, but not design (i.e., sampling +#' or probability) weights, which should be used when analyzing complex samples +#' (e.g., probability samples). `rescale_weights()` implements two algorithms, +#' one proposed by \cite{Asparouhov (2006)} and \cite{Carle (2009)} and one +#' proposed by by \cite{Asparouhov (2006)} and \cite{Carle (2009)}, to rescale +#' design weights in survey data to account for the grouping structure of +#' multilevel models, and and one based on the design effect proposed by +#' \cite{Kish (1965)}, to rescale weights by the design effect to account for +#' additional sampling error introduced by weighting. #' @param data A data frame. #' @param by Variable names (as character vector, or as formula), indicating -#' the grouping structure (strata) of the survey data (level-2-cluster -#' variable). It is also possible to create weights for multiple group -#' variables; in such cases, each created weighting variable will be suffixed -#' by the name of the group variable. +#' the grouping structure (strata) of the survey data (level-2-cluster +#' variable). It is also possible to create weights for multiple group +#' variables; in such cases, each created weighting variable will be suffixed +#' by the name of the group variable. This argument is required for +#' `method = "carle"`, but optional for `method = "kish"`. #' @param probability_weights Variable indicating the probability (design or -#' sampling) weights of the survey data (level-1-weight). -#' @param nest Logical, if `TRUE` and `by` indicates at least two -#' group variables, then groups are "nested", i.e. groups are now a -#' combination from each group level of the variables in `by`. +#' sampling) weights of the survey data (level-1-weight), provided as character +#' string or formula. +#' @param nest Logical, if `TRUE` and `by` indicates at least two group +#' variables, then groups are "nested", i.e. groups are now a combination from +#' each group level of the variables in `by`. This argument is not used when +#' `method = "kish"`. +#' @param method String, indicating which rescale-method is used for rescaling +#' weights. Can be either `"carle"` (default) or `"kish"`. See 'Details'. If +#' `method = "carle"`, the `by` argument is required. #' -#' @return `data`, including the new weighting variables: `pweights_a` -#' and `pweights_b`, which represent the rescaled design weights to use -#' in multilevel models (use these variables for the `weights` argument). +#' @return +#' `data`, including the new weighting variable(s). For `method = "carle"`, new +#' columns `rescaled_weights_a` and `rescaled_weights_b` are returned, and for +#' `method = "kish"`, the returned data contains a column `rescaled_weights`. +#' These represent the rescaled design weights to use in multilevel models (use +#' these variables for the `weights` argument). #' #' @details +#' - `method = "carle"` +#' +#' Rescaling is based on two methods: For `rescaled_weights_a`, the sample +#' weights `probability_weights` are adjusted by a factor that represents the +#' proportion of group size divided by the sum of sampling weights within each +#' group. The adjustment factor for `rescaled_weights_b` is the sum of sample +#' weights within each group divided by the sum of squared sample weights +#' within each group (see Carle (2009), Appendix B). In other words, +#' `rescaled_weights_a` "scales the weights so that the new weights sum to the +#' cluster sample size" while `rescaled_weights_b` "scales the weights so that +#' the new weights sum to the effective cluster size". +#' +#' Regarding the choice between scaling methods A and B, Carle suggests that +#' "analysts who wish to discuss point estimates should report results based +#' on weighting method A. For analysts more interested in residual +#' between-group variance, method B may generally provide the least biased +#' estimates". In general, it is recommended to fit a non-weighted model and +#' weighted models with both scaling methods and when comparing the models, +#' see whether the "inferential decisions converge", to gain confidence in the +#' results. +#' +#' Though the bias of scaled weights decreases with increasing group size, +#' method A is preferred when insufficient or low group size is a concern. +#' +#' The group ID and probably PSU may be used as random effects (e.g. nested +#' design, or group and PSU as varying intercepts), depending on the survey +#' design that should be mimicked. +#' +#' - `method = "kish"` +#' +#' Rescaling is based on scaling the sample weights so the mean value is 1, +#' which means the sum of all weights equals the sample size. Next, the design +#' effect (_Kish 1965_) is calculated, which is the mean of the squared +#' weights divided by the squared mean of the weights. The scaled sample +#' weights are then divided by the design effect. This method is most +#' appropriate when weights are based on additional variables beyond the +#' grouping variables in the model (e.g., other demographic characteristics), +#' but may also be useful in other contexts. #' -#' Rescaling is based on two methods: For `pweights_a`, the sample weights -#' `probability_weights` are adjusted by a factor that represents the proportion -#' of group size divided by the sum of sampling weights within each group. The -#' adjustment factor for `pweights_b` is the sum of sample weights within each -#' group divided by the sum of squared sample weights within each group (see -#' Carle (2009), Appendix B). In other words, `pweights_a` "scales the weights -#' so that the new weights sum to the cluster sample size" while `pweights_b` -#' "scales the weights so that the new weights sum to the effective cluster -#' size". -#' -#' Regarding the choice between scaling methods A and B, Carle suggests that -#' "analysts who wish to discuss point estimates should report results based on -#' weighting method A. For analysts more interested in residual between-group -#' variance, method B may generally provide the least biased estimates". In -#' general, it is recommended to fit a non-weighted model and weighted models -#' with both scaling methods and when comparing the models, see whether the -#' "inferential decisions converge", to gain confidence in the results. -#' -#' Though the bias of scaled weights decreases with increasing group size, -#' method A is preferred when insufficient or low group size is a concern. -#' -#' The group ID and probably PSU may be used as random effects (e.g. nested -#' design, or group and PSU as varying intercepts), depending on the survey -#' design that should be mimicked. +#' Some tests on real-world survey-data suggest that, in comparison to the +#' Carle-method, the Kish-method comes closer to estimates from a regular +#' survey-design using the **survey** package. Note that these tests are not +#' representative and it is recommended to check your results against a +#' standard survey-design. #' #' @references +#' - Asparouhov T. (2006). General Multi-Level Modeling with Sampling +#' Weights. Communications in Statistics - Theory and Methods 35: 439-460 +#' #' - Carle A.C. (2009). Fitting multilevel models in complex survey data #' with design weights: Recommendations. BMC Medical Research Methodology #' 9(49): 1-13 #' -#' - Asparouhov T. (2006). General Multi-Level Modeling with Sampling -#' Weights. Communications in Statistics - Theory and Methods 35: 439-460 +#' - Kish, L. (1965) Survey Sampling. London: Wiley. +#' +#' @examplesIf all(insight::check_if_installed(c("lme4", "parameters"), quietly = TRUE)) +#' data(nhanes_sample) +#' head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")) +#' +#' # also works with multiple group-variables +#' head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU"))) #' -#' @examples -#' if (require("lme4")) { -#' data(nhanes_sample) -#' head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR")) -#' -#' # also works with multiple group-variables -#' head(rescale_weights(nhanes_sample, c("SDMVSTRA", "SDMVPSU"), "WTINT2YR")) -#' -#' # or nested structures. -#' x <- rescale_weights( -#' data = nhanes_sample, -#' by = c("SDMVSTRA", "SDMVPSU"), -#' probability_weights = "WTINT2YR", -#' nest = TRUE -#' ) -#' head(x) -#' -#' nhanes_sample <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR") -#' -#' glmer( -#' total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)) + (1 | SDMVPSU), -#' family = poisson(), -#' data = nhanes_sample, -#' weights = pweights_a -#' ) +#' # or nested structures. +#' x <- rescale_weights( +#' data = nhanes_sample, +#' probability_weights = "WTINT2YR", +#' by = c("SDMVSTRA", "SDMVPSU"), +#' nest = TRUE +#' ) +#' head(x) +#' +#' \donttest{ +#' # compare different methods, using multilevel-Poisson regression +#' +#' d <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") +#' result1 <- lme4::glmer( +#' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), +#' family = poisson(), +#' data = d, +#' weights = rescaled_weights_a +#' ) +#' result2 <- lme4::glmer( +#' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), +#' family = poisson(), +#' data = d, +#' weights = rescaled_weights_b +#' ) +#' +#' d <- rescale_weights( +#' nhanes_sample, +#' "WTINT2YR", +#' method = "kish" +#' ) +#' result3 <- lme4::glmer( +#' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), +#' family = poisson(), +#' data = d, +#' weights = rescaled_weights +#' ) +#' d <- rescale_weights( +#' nhanes_sample, +#' "WTINT2YR", +#' "SDMVSTRA", +#' method = "kish" +#' ) +#' result4 <- lme4::glmer( +#' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), +#' family = poisson(), +#' data = d, +#' weights = rescaled_weights +#' ) +#' parameters::compare_parameters( +#' list(result1, result2, result3, result4), +#' exponentiate = TRUE, +#' column_names = c("Carle (A)", "Carle (B)", "Kish", "Kish (grouped)") +#' ) #' } #' @export -rescale_weights <- function(data, by, probability_weights, nest = FALSE) { +rescale_weights <- function(data, + probability_weights = NULL, + by = NULL, + nest = FALSE, + method = "carle") { + method <- insight::validate_argument(method, c("carle", "kish")) + + # convert formulas to strings if (inherits(by, "formula")) { by <- all.vars(by) } + if (inherits(probability_weights, "formula")) { + probability_weights <- all.vars(probability_weights) + } + + # check for existing variable names + if ((method == "carle" && any(c("rescaled_weights_a", "rescaled_weights_b") %in% colnames(data))) || + (method == "kish" && "rescaled_weights" %in% colnames(data))) { + insight::format_warning("The variable name for the rescaled weights already exists in the data. Returned columns will be renamed into unique names.") # nolint + } + + # need probability_weights + if (is.null(probability_weights)) { + insight::format_error("The argument `probability_weights` is missing, but required to rescale weights.") + } + # check if weight has missings. we need to remove them first, # and add back weights to correct cases later @@ -104,9 +194,96 @@ rescale_weights <- function(data, by, probability_weights, nest = FALSE) { data_tmp <- data } + fun_args <- list( + nest = nest, + probability_weights = probability_weights, + data_tmp = data_tmp, + data = data, + by = by, + weight_non_na = weight_non_na + ) + + switch(method, + carle = do.call(.rescale_weights_carle, fun_args), + do.call(.rescale_weights_kish, fun_args) + ) +} + + +# rescale weights, method Kish ---------------------------- + +.rescale_weights_kish <- function(nest, probability_weights, data_tmp, data, by, weight_non_na) { # sort id data_tmp$.bamboozled <- seq_len(nrow(data_tmp)) + # `nest` is currently ignored + if (isTRUE(nest)) { + insight::format_warning("Argument `nest` is ignored for `method = \"kish\"`.") + } + + # check by argument + if (!is.null(by) && !all(by %in% colnames(data_tmp))) { + dont_exist <- setdiff(by, colnames(data_tmp)) + insight::format_error( + paste0( + "The following variable(s) specified in `by` don't exist in the dataset: ", + text_concatenate(dont_exist), "." + ), + .misspelled_string(colnames(data_tmp), dont_exist, "Possibly misspelled?") + ) + } else if (is.null(by)) { + # if `by` = NULL, we create a dummy group + by <- "tmp_kish_by" + data_tmp[[by]] <- 1 + } + + # split into groups, and calculate weights + out <- lapply(split(data_tmp, data_tmp[by]), function(group_data) { + p_weights <- group_data[[probability_weights]] + # design effect according to Kish + deff <- mean(p_weights^2) / (mean(p_weights)^2) + # rescale weights, so their mean is 1 + z_weights <- p_weights * (1 / mean(p_weights)) + # divide weights by design effect + group_data$rescaled_weights <- z_weights / deff + group_data + }) + + # bind data + result <- do.call(rbind, out) + + # restore original order + result <- result[order(result$.bamboozled), ] + + # add back rescaled weights to original data, but account for missing observations + data$rescaled_weights <- NA_real_ + data$rescaled_weights[weight_non_na] <- result$rescaled_weights + # return result + data +} + + +# rescale weights, method Carle ---------------------------- + +.rescale_weights_carle <- function(nest, probability_weights, data_tmp, data, by, weight_non_na) { + # sort id + data_tmp$.bamboozled <- seq_len(nrow(data_tmp)) + + if (is.null(by)) { + insight::format_error("Argument `by` must be specified. Please provide one or more variable names in `by` that indicate the grouping structure (strata) of the survey data (level-2-cluster variable).") # nolint + } + + if (!all(by %in% colnames(data_tmp))) { + dont_exist <- setdiff(by, colnames(data_tmp)) + insight::format_error( + paste0( + "The following variable(s) specified in `by` don't exist in the dataset: ", + text_concatenate(dont_exist), "." + ), + .misspelled_string(colnames(data_tmp), dont_exist, "Possibly misspelled?") + ) + } + if (nest && length(by) < 2) { insight::format_warning( sprintf( @@ -129,7 +306,15 @@ rescale_weights <- function(data, by, probability_weights, nest = FALSE) { }) } - do.call(cbind, list(data, out)) + make_unique_names <- any(vapply(out, function(i) any(colnames(i) %in% colnames(data)), logical(1))) + # add weights to data frame + out <- do.call(cbind, list(data, out)) + # check if we have to rename columns + if (make_unique_names) { + colnames(out) <- make.unique(colnames(out), sep = "_") + } + + out } @@ -158,12 +343,12 @@ rescale_weights <- function(data, by, probability_weights, nest = FALSE) { w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group out <- data.frame( - pweights_a = rep(NA_real_, times = n), - pweights_b = rep(NA_real_, times = n) + rescaled_weights_a = rep(NA_real_, times = n), + rescaled_weights_b = rep(NA_real_, times = n) ) - out$pweights_a[weight_non_na] <- w_a - out$pweights_b[weight_non_na] <- w_b + out$rescaled_weights_a[weight_non_na] <- w_a + out$rescaled_weights_b[weight_non_na] <- w_b out } @@ -206,12 +391,12 @@ rescale_weights <- function(data, by, probability_weights, nest = FALSE) { w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group out <- data.frame( - pweights_a = rep(NA_real_, times = n), - pweights_b = rep(NA_real_, times = n) + rescaled_weights_a = rep(NA_real_, times = n), + rescaled_weights_b = rep(NA_real_, times = n) ) - out$pweights_a[weight_non_na] <- w_a - out$pweights_b[weight_non_na] <- w_b + out$rescaled_weights_a[weight_non_na] <- w_a + out$rescaled_weights_b[weight_non_na] <- w_b out } diff --git a/R/select_nse.R b/R/select_nse.R index aa9632eb0..06ddc9bbc 100644 --- a/R/select_nse.R +++ b/R/select_nse.R @@ -198,11 +198,13 @@ } # small helper, to avoid duplicated code + .action_if_not_found <- function(x, columns, matches, verbose, ifnotfound) { + msg <- paste0( "Following variable(s) were not found: ", toString(x[is.na(matches)]) diff --git a/inst/WORDLIST b/inst/WORDLIST index a8b4ff08d..176d19b6a 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -26,6 +26,7 @@ Heisig Herrington Hoffmann Joanes +Kish Llabre Lumley MADs diff --git a/man/rescale_weights.Rd b/man/rescale_weights.Rd index d9651decb..85ffae0b9 100644 --- a/man/rescale_weights.Rd +++ b/man/rescale_weights.Rd @@ -4,56 +4,78 @@ \alias{rescale_weights} \title{Rescale design weights for multilevel analysis} \usage{ -rescale_weights(data, by, probability_weights, nest = FALSE) +rescale_weights( + data, + probability_weights = NULL, + by = NULL, + nest = FALSE, + method = "carle" +) } \arguments{ \item{data}{A data frame.} +\item{probability_weights}{Variable indicating the probability (design or +sampling) weights of the survey data (level-1-weight), provided as character +string or formula.} + \item{by}{Variable names (as character vector, or as formula), indicating the grouping structure (strata) of the survey data (level-2-cluster variable). It is also possible to create weights for multiple group variables; in such cases, each created weighting variable will be suffixed -by the name of the group variable.} +by the name of the group variable. This argument is required for +\code{method = "carle"}, but optional for \code{method = "kish"}.} -\item{probability_weights}{Variable indicating the probability (design or -sampling) weights of the survey data (level-1-weight).} +\item{nest}{Logical, if \code{TRUE} and \code{by} indicates at least two group +variables, then groups are "nested", i.e. groups are now a combination from +each group level of the variables in \code{by}. This argument is not used when +\code{method = "kish"}.} -\item{nest}{Logical, if \code{TRUE} and \code{by} indicates at least two -group variables, then groups are "nested", i.e. groups are now a -combination from each group level of the variables in \code{by}.} +\item{method}{String, indicating which rescale-method is used for rescaling +weights. Can be either \code{"carle"} (default) or \code{"kish"}. See 'Details'. If +\code{method = "carle"}, the \code{by} argument is required.} } \value{ -\code{data}, including the new weighting variables: \code{pweights_a} -and \code{pweights_b}, which represent the rescaled design weights to use -in multilevel models (use these variables for the \code{weights} argument). +\code{data}, including the new weighting variable(s). For \code{method = "carle"}, new +columns \code{rescaled_weights_a} and \code{rescaled_weights_b} are returned, and for +\code{method = "kish"}, the returned data contains a column \code{rescaled_weights}. +These represent the rescaled design weights to use in multilevel models (use +these variables for the \code{weights} argument). } \description{ Most functions to fit multilevel and mixed effects models only -allow to specify frequency weights, but not design (i.e. sampling or -probability) weights, which should be used when analyzing complex samples -and survey data. \code{rescale_weights()} implements an algorithm proposed -by \cite{Asparouhov (2006)} and \cite{Carle (2009)} to rescale design -weights in survey data to account for the grouping structure of multilevel -models, which then can be used for multilevel modelling. +allow the user to specify frequency weights, but not design (i.e., sampling +or probability) weights, which should be used when analyzing complex samples +(e.g., probability samples). \code{rescale_weights()} implements two algorithms, +one proposed by \cite{Asparouhov (2006)} and \cite{Carle (2009)} and one +proposed by by \cite{Asparouhov (2006)} and \cite{Carle (2009)}, to rescale +design weights in survey data to account for the grouping structure of +multilevel models, and and one based on the design effect proposed by +\cite{Kish (1965)}, to rescale weights by the design effect to account for +additional sampling error introduced by weighting. } \details{ -Rescaling is based on two methods: For \code{pweights_a}, the sample weights -\code{probability_weights} are adjusted by a factor that represents the proportion -of group size divided by the sum of sampling weights within each group. The -adjustment factor for \code{pweights_b} is the sum of sample weights within each -group divided by the sum of squared sample weights within each group (see -Carle (2009), Appendix B). In other words, \code{pweights_a} "scales the weights -so that the new weights sum to the cluster sample size" while \code{pweights_b} -"scales the weights so that the new weights sum to the effective cluster -size". +\itemize{ +\item \code{method = "carle"} + +Rescaling is based on two methods: For \code{rescaled_weights_a}, the sample +weights \code{probability_weights} are adjusted by a factor that represents the +proportion of group size divided by the sum of sampling weights within each +group. The adjustment factor for \code{rescaled_weights_b} is the sum of sample +weights within each group divided by the sum of squared sample weights +within each group (see Carle (2009), Appendix B). In other words, +\code{rescaled_weights_a} "scales the weights so that the new weights sum to the +cluster sample size" while \code{rescaled_weights_b} "scales the weights so that +the new weights sum to the effective cluster size". Regarding the choice between scaling methods A and B, Carle suggests that -"analysts who wish to discuss point estimates should report results based on -weighting method A. For analysts more interested in residual between-group -variance, method B may generally provide the least biased estimates". In -general, it is recommended to fit a non-weighted model and weighted models -with both scaling methods and when comparing the models, see whether the -"inferential decisions converge", to gain confidence in the results. +"analysts who wish to discuss point estimates should report results based +on weighting method A. For analysts more interested in residual +between-group variance, method B may generally provide the least biased +estimates". In general, it is recommended to fit a non-weighted model and +weighted models with both scaling methods and when comparing the models, +see whether the "inferential decisions converge", to gain confidence in the +results. Though the bias of scaled weights decreases with increasing group size, method A is preferred when insufficient or low group size is a concern. @@ -61,40 +83,96 @@ method A is preferred when insufficient or low group size is a concern. The group ID and probably PSU may be used as random effects (e.g. nested design, or group and PSU as varying intercepts), depending on the survey design that should be mimicked. +\item \code{method = "kish"} + +Rescaling is based on scaling the sample weights so the mean value is 1, +which means the sum of all weights equals the sample size. Next, the design +effect (\emph{Kish 1965}) is calculated, which is the mean of the squared +weights divided by the squared mean of the weights. The scaled sample +weights are then divided by the design effect. This method is most +appropriate when weights are based on additional variables beyond the +grouping variables in the model (e.g., other demographic characteristics), +but may also be useful in other contexts. + +Some tests on real-world survey-data suggest that, in comparison to the +Carle-method, the Kish-method comes closer to estimates from a regular +survey-design using the \strong{survey} package. Note that these tests are not +representative and it is recommended to check your results against a +standard survey-design. +} } \examples{ -if (require("lme4")) { - data(nhanes_sample) - head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR")) +\dontshow{if (all(insight::check_if_installed(c("lme4", "parameters"), quietly = TRUE))) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +data(nhanes_sample) +head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")) + +# also works with multiple group-variables +head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU"))) - # also works with multiple group-variables - head(rescale_weights(nhanes_sample, c("SDMVSTRA", "SDMVPSU"), "WTINT2YR")) +# or nested structures. +x <- rescale_weights( + data = nhanes_sample, + probability_weights = "WTINT2YR", + by = c("SDMVSTRA", "SDMVPSU"), + nest = TRUE +) +head(x) - # or nested structures. - x <- rescale_weights( - data = nhanes_sample, - by = c("SDMVSTRA", "SDMVPSU"), - probability_weights = "WTINT2YR", - nest = TRUE - ) - head(x) +\donttest{ +# compare different methods, using multilevel-Poisson regression - nhanes_sample <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR") +d <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") +result1 <- lme4::glmer( + total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), + family = poisson(), + data = d, + weights = rescaled_weights_a +) +result2 <- lme4::glmer( + total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), + family = poisson(), + data = d, + weights = rescaled_weights_b +) - glmer( - total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)) + (1 | SDMVPSU), - family = poisson(), - data = nhanes_sample, - weights = pweights_a - ) +d <- rescale_weights( + nhanes_sample, + "WTINT2YR", + method = "kish" +) +result3 <- lme4::glmer( + total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), + family = poisson(), + data = d, + weights = rescaled_weights +) +d <- rescale_weights( + nhanes_sample, + "WTINT2YR", + "SDMVSTRA", + method = "kish" +) +result4 <- lme4::glmer( + total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), + family = poisson(), + data = d, + weights = rescaled_weights +) +parameters::compare_parameters( + list(result1, result2, result3, result4), + exponentiate = TRUE, + column_names = c("Carle (A)", "Carle (B)", "Kish", "Kish (grouped)") +) } +\dontshow{\}) # examplesIf} } \references{ \itemize{ +\item Asparouhov T. (2006). General Multi-Level Modeling with Sampling +Weights. Communications in Statistics - Theory and Methods 35: 439-460 \item Carle A.C. (2009). Fitting multilevel models in complex survey data with design weights: Recommendations. BMC Medical Research Methodology 9(49): 1-13 -\item Asparouhov T. (2006). General Multi-Level Modeling with Sampling -Weights. Communications in Statistics - Theory and Methods 35: 439-460 +\item Kish, L. (1965) Survey Sampling. London: Wiley. } } diff --git a/tests/testthat/_snaps/rescale_weights.md b/tests/testthat/_snaps/rescale_weights.md index 5de6d489a..f4a18f3bf 100644 --- a/tests/testthat/_snaps/rescale_weights.md +++ b/tests/testthat/_snaps/rescale_weights.md @@ -1,20 +1,27 @@ # rescale_weights works as expected Code - head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR")) + head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")) Output - total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR pweights_a pweights_b - 1 1 2.20 1 3 2 31 97593.68 1.5733612 1.2005159 - 2 7 2.08 2 3 1 29 39599.36 0.6231745 0.5246593 - 3 3 1.48 2 1 2 42 26619.83 0.8976966 0.5439111 - 4 4 1.32 2 4 2 33 34998.53 0.7083628 0.5498944 - 5 1 2.00 2 1 1 41 14746.45 0.4217782 0.3119698 - 6 6 2.20 2 4 1 38 28232.10 0.6877550 0.5155503 + total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a + 1 1 2.20 1 3 2 31 97593.68 1.5733612 + 2 7 2.08 2 3 1 29 39599.36 0.6231745 + 3 3 1.48 2 1 2 42 26619.83 0.8976966 + 4 4 1.32 2 4 2 33 34998.53 0.7083628 + 5 1 2.00 2 1 1 41 14746.45 0.4217782 + 6 6 2.20 2 4 1 38 28232.10 0.6877550 + rescaled_weights_b + 1 1.2005159 + 2 0.5246593 + 3 0.5439111 + 4 0.5498944 + 5 0.3119698 + 6 0.5155503 --- Code - head(rescale_weights(nhanes_sample, c("SDMVSTRA", "SDMVPSU"), "WTINT2YR")) + head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU"))) Output total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR pweight_a_SDMVSTRA 1 1 2.20 1 3 2 31 97593.68 1.5733612 @@ -31,72 +38,160 @@ 5 0.3119698 0.3060151 0.2152722 6 0.5155503 0.5858662 0.4121388 +--- + + Code + head(rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish")) + Output + total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights + 1 1 2.20 1 3 2 31 97593.68 1.3952529 + 2 7 2.08 2 3 1 29 39599.36 0.5661343 + 3 3 1.48 2 1 2 42 26619.83 0.3805718 + 4 4 1.32 2 4 2 33 34998.53 0.5003582 + 5 1 2.00 2 1 1 41 14746.45 0.2108234 + 6 6 2.20 2 4 1 38 28232.10 0.4036216 + +--- + + Code + rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") + Output + total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a + 1 1 2.20 1 3 2 31 97593.68 1.0000000 + 2 7 2.08 2 3 1 29 39599.36 0.5819119 + 3 3 1.48 2 1 2 42 NA NA + 4 4 1.32 2 4 2 33 34998.53 0.6766764 + 5 1 2.00 2 1 1 41 14746.45 0.7471696 + 6 6 2.20 2 4 1 38 28232.10 1.0000000 + 7 350 1.60 1 3 2 33 93162.43 1.8012419 + 8 NA 1.48 2 3 1 29 82275.99 1.2090441 + 9 3 2.28 2 4 1 41 24726.39 1.2528304 + 10 30 0.84 1 3 2 35 NA NA + 11 70 1.24 1 4 2 33 27002.70 0.5220817 + 12 5 1.68 2 1 2 39 18792.03 1.0000000 + 13 60 2.20 1 3 2 30 76894.56 1.0000000 + 14 2 1.48 2 3 1 29 NA NA + 15 8 2.36 2 3 2 39 NA NA + 16 3 2.04 2 3 2 36 98200.91 1.0000000 + 17 1 2.08 1 3 1 40 87786.09 1.0000000 + 18 7 1.00 1 3 2 32 90803.16 1.0000000 + 19 9 2.28 2 3 2 34 NA NA + 20 2 1.24 2 3 1 29 82275.99 1.2090441 + rescaled_weights_b + 1 1.0000000 + 2 0.5351412 + 3 NA + 4 0.5107078 + 5 0.7022777 + 6 1.0000000 + 7 1.3594509 + 8 1.1118681 + 9 1.1775572 + 10 NA + 11 0.3940306 + 12 1.0000000 + 13 1.0000000 + 14 NA + 15 NA + 16 1.0000000 + 17 1.0000000 + 18 1.0000000 + 19 NA + 20 1.1118681 + +--- + + Code + rescale_weights(nhanes_sample, "WTINT2YR", method = "kish") + Output + total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights + 1 1 2.20 1 3 2 31 97593.68 1.2734329 + 2 7 2.08 2 3 1 29 39599.36 0.5167049 + 3 3 1.48 2 1 2 42 NA NA + 4 4 1.32 2 4 2 33 34998.53 0.4566718 + 5 1 2.00 2 1 1 41 14746.45 0.1924164 + 6 6 2.20 2 4 1 38 28232.10 0.3683813 + 7 350 1.60 1 3 2 33 93162.43 1.2156126 + 8 NA 1.48 2 3 1 29 82275.99 1.0735629 + 9 3 2.28 2 4 1 41 24726.39 0.3226377 + 10 30 0.84 1 3 2 35 NA NA + 11 70 1.24 1 4 2 33 27002.70 0.3523397 + 12 5 1.68 2 1 2 39 18792.03 0.2452044 + 13 60 2.20 1 3 2 30 76894.56 1.0033444 + 14 2 1.48 2 3 1 29 NA NA + 15 8 2.36 2 3 2 39 NA NA + 16 3 2.04 2 3 2 36 98200.91 1.2813563 + 17 1 2.08 1 3 1 40 87786.09 1.1454605 + 18 7 1.00 1 3 2 32 90803.16 1.1848281 + 19 9 2.28 2 3 2 34 NA NA + 20 2 1.24 2 3 1 29 82275.99 1.0735629 + # rescale_weights nested works as expected Code rescale_weights(data = head(nhanes_sample, n = 30), by = c("SDMVSTRA", "SDMVPSU"), probability_weights = "WTINT2YR", nest = TRUE) Output - total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR pweights_a - 1 1 2.20 1 3 2 31 97593.679 1.0000000 - 2 7 2.08 2 3 1 29 39599.363 0.5502486 - 3 3 1.48 2 1 2 42 26619.834 0.9512543 - 4 4 1.32 2 4 2 33 34998.530 0.6766764 - 5 1 2.00 2 1 1 41 14746.454 0.7147710 - 6 6 2.20 2 4 1 38 28232.100 1.0000000 - 7 350 1.60 1 3 2 33 93162.431 1.8012419 - 8 NA 1.48 2 3 1 29 82275.986 1.1432570 - 9 3 2.28 2 4 1 41 24726.391 1.1985056 - 10 30 0.84 1 3 2 35 39895.048 1.0000000 - 11 70 1.24 1 4 2 33 27002.703 0.5220817 - 12 5 1.68 2 1 2 39 18792.034 0.3866720 - 13 60 2.20 1 3 2 30 76894.563 1.0000000 - 14 2 1.48 2 3 1 29 82275.986 1.1432570 - 15 8 2.36 2 3 2 39 78406.811 1.6133280 - 16 3 2.04 2 3 2 36 98200.912 1.0000000 - 17 1 2.08 1 3 1 40 87786.091 1.0000000 - 18 7 1.00 1 3 2 32 90803.158 1.2693642 - 19 9 2.28 2 3 2 34 45002.917 1.0000000 - 20 2 1.24 2 3 1 29 82275.986 1.1432570 - 21 4 2.28 2 3 1 34 91437.145 1.4088525 - 22 3 1.04 1 1 2 42 29348.027 1.0487457 - 23 4 1.12 1 1 1 34 38366.567 0.5911475 - 24 1 1.52 2 1 1 42 6622.334 1.0000000 - 25 22 2.24 1 4 1 41 22420.209 1.0867233 - 26 7 1.00 2 3 2 41 65529.204 1.0000000 - 27 5 0.92 2 4 1 30 27089.745 1.0000000 - 28 15 1.04 1 3 2 32 52265.570 0.7306358 - 29 3 0.80 1 3 1 33 64789.307 1.0000000 - 30 1 1.00 1 3 1 29 73404.222 1.0199804 - pweights_b - 1 1.0000000 - 2 0.5226284 - 3 0.9489993 - 4 0.5107078 - 5 0.6854605 - 6 1.0000000 - 7 1.3594509 - 8 1.0858702 - 9 1.1493587 - 10 1.0000000 - 11 0.3940306 - 12 0.2809766 - 13 1.0000000 - 14 1.0858702 - 15 1.1723308 - 16 1.0000000 - 17 1.0000000 - 18 1.1834934 - 19 1.0000000 - 20 1.0858702 - 21 1.2070771 - 22 1.0462596 - 23 0.5064835 - 24 1.0000000 - 25 1.0421602 - 26 1.0000000 - 27 1.0000000 - 28 0.6812093 - 29 1.0000000 - 30 0.9687816 + total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a + 1 1 2.20 1 3 2 31 97593.679 1.0000000 + 2 7 2.08 2 3 1 29 39599.363 0.5502486 + 3 3 1.48 2 1 2 42 26619.834 0.9512543 + 4 4 1.32 2 4 2 33 34998.530 0.6766764 + 5 1 2.00 2 1 1 41 14746.454 0.7147710 + 6 6 2.20 2 4 1 38 28232.100 1.0000000 + 7 350 1.60 1 3 2 33 93162.431 1.8012419 + 8 NA 1.48 2 3 1 29 82275.986 1.1432570 + 9 3 2.28 2 4 1 41 24726.391 1.1985056 + 10 30 0.84 1 3 2 35 39895.048 1.0000000 + 11 70 1.24 1 4 2 33 27002.703 0.5220817 + 12 5 1.68 2 1 2 39 18792.034 0.3866720 + 13 60 2.20 1 3 2 30 76894.563 1.0000000 + 14 2 1.48 2 3 1 29 82275.986 1.1432570 + 15 8 2.36 2 3 2 39 78406.811 1.6133280 + 16 3 2.04 2 3 2 36 98200.912 1.0000000 + 17 1 2.08 1 3 1 40 87786.091 1.0000000 + 18 7 1.00 1 3 2 32 90803.158 1.2693642 + 19 9 2.28 2 3 2 34 45002.917 1.0000000 + 20 2 1.24 2 3 1 29 82275.986 1.1432570 + 21 4 2.28 2 3 1 34 91437.145 1.4088525 + 22 3 1.04 1 1 2 42 29348.027 1.0487457 + 23 4 1.12 1 1 1 34 38366.567 0.5911475 + 24 1 1.52 2 1 1 42 6622.334 1.0000000 + 25 22 2.24 1 4 1 41 22420.209 1.0867233 + 26 7 1.00 2 3 2 41 65529.204 1.0000000 + 27 5 0.92 2 4 1 30 27089.745 1.0000000 + 28 15 1.04 1 3 2 32 52265.570 0.7306358 + 29 3 0.80 1 3 1 33 64789.307 1.0000000 + 30 1 1.00 1 3 1 29 73404.222 1.0199804 + rescaled_weights_b + 1 1.0000000 + 2 0.5226284 + 3 0.9489993 + 4 0.5107078 + 5 0.6854605 + 6 1.0000000 + 7 1.3594509 + 8 1.0858702 + 9 1.1493587 + 10 1.0000000 + 11 0.3940306 + 12 0.2809766 + 13 1.0000000 + 14 1.0858702 + 15 1.1723308 + 16 1.0000000 + 17 1.0000000 + 18 1.1834934 + 19 1.0000000 + 20 1.0858702 + 21 1.2070771 + 22 1.0462596 + 23 0.5064835 + 24 1.0000000 + 25 1.0421602 + 26 1.0000000 + 27 1.0000000 + 28 0.6812093 + 29 1.0000000 + 30 0.9687816 diff --git a/tests/testthat/test-rescale_weights.R b/tests/testthat/test-rescale_weights.R index 504157180..7678e0816 100644 --- a/tests/testthat/test-rescale_weights.R +++ b/tests/testthat/test-rescale_weights.R @@ -1,14 +1,42 @@ test_that("rescale_weights works as expected", { data(nhanes_sample) + # convert tibble into data frame, so check-hard GHA works + nhanes_sample <- as.data.frame(nhanes_sample) - expect_snapshot(head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR"))) + expect_snapshot(head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA"))) - expect_snapshot(head(rescale_weights(nhanes_sample, c("SDMVSTRA", "SDMVPSU"), "WTINT2YR"))) + expect_snapshot(head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU")))) + + expect_snapshot(head(rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish"))) + + out <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") + expect_equal(sum(out$rescaled_weights_a), 2992, tolerance = 1e-3) + expect_equal(sum(out$rescaled_weights_b), 2244.71451, tolerance = 1e-3) + out <- rescale_weights(nhanes_sample, "WTINT2YR", method = "kish") + expect_equal(sum(out$rescaled_weights), 2162.53961, tolerance = 1e-3) + out <- rescale_weights(nhanes_sample, "WTINT2YR", by = "SDMVPSU", method = "kish") + expect_equal(sum(out$rescaled_weights), 2163.3657, tolerance = 1e-3) +}) + + +test_that("rescale_weights works as expected", { + data(nhanes_sample) + # convert tibble into data frame, so check-hard GHA works + nhanes_sample <- as.data.frame(nhanes_sample)[1:20, ] + + # add NAs + set.seed(123) + nhanes_sample$WTINT2YR[sample.int(nrow(nhanes_sample), 5)] <- NA + + expect_snapshot(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA")) + expect_snapshot(rescale_weights(nhanes_sample, "WTINT2YR", method = "kish")) }) test_that("rescale_weights nested works as expected", { data(nhanes_sample) + # convert tibble into data frame, so check-hard GHA works + nhanes_sample <- as.data.frame(nhanes_sample) expect_snapshot( rescale_weights( @@ -40,3 +68,78 @@ test_that("rescale_weights nested works as expected", { ) ) }) + + +test_that("rescale_weights errors and warnings", { + data(nhanes_sample) + expect_error( + rescale_weights( + data = head(nhanes_sample, n = 30), + by = c("a", "SDMVSTRA", "c"), + probability_weights = "WTINT2YR" + ), + regex = "The following" + ) + expect_error( + rescale_weights( + data = head(nhanes_sample, n = 30), + by = "SDMVSTRA", + probability_weights = NULL + ), + regex = "is missing, but required" + ) + expect_error( + rescale_weights( + data = head(nhanes_sample, n = 30), + by = NULL, + probability_weights = "WTINT2YR" + ), + regex = "must be specified" + ) + expect_error( + rescale_weights( + data = head(nhanes_sample, n = 30), + by = "abc", + probability_weights = "WTINT2YR", + method = "kish" + ), + regex = "The following variable" + ) + expect_warning( + rescale_weights( + data = head(nhanes_sample, n = 30), + by = "SDMVSTRA", + probability_weights = "WTINT2YR", + nest = TRUE, + method = "kish" + ), + regex = "is ignored" + ) + expect_error( + rescale_weights( + data = head(nhanes_sample, n = 30), + probability_weights = "WTINT2YR", + method = "dish" + ), + regex = "Invalid option for argument" + ) + + nhanes_sample$rescaled_weights_a <- 1 + expect_warning( + { + out <- rescale_weights( + data = head(nhanes_sample, n = 30), + by = "SDMVSTRA", + probability_weights = "WTINT2YR" + ) + }, + regex = "The variable name" + ) + expect_named( + out, + c( + "total", "age", "RIAGENDR", "RIDRETH1", "SDMVPSU", "SDMVSTRA", + "WTINT2YR", "rescaled_weights_a", "rescaled_weights_a_1", "rescaled_weights_b" + ) + ) +})