Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixing create_analysis_median #38

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: analysistools
Title: Tools to perform some analysis on survey data collected with ODK
Version: 0.0.0.902
Version: 0.0.0.903
Authors@R: c(
person("Yann", "Say", , "yann.say@impact-initiatives.org", role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-7390-4209")),
Expand All @@ -21,7 +21,7 @@ Imports:
magrittr,
purrr,
rlang,
srvyr,
srvyr (>= 1.3.0),
stringr,
tidyr,
tidyselect
Expand All @@ -37,5 +37,5 @@ Config/testthat/edition: 3
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
URL: https://impact-initiatives.github.io/analysistools/
99 changes: 53 additions & 46 deletions R/create_analysis_median.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
#' @param analysis_var the independent variable, variable to summarise
#' @param level the confidence level. 0.95 is default
#'
#' @note The results may differ with median(). There are lots of ways to calculate the median and
#' the default calculation between stats::median and survey::svyquantile/srvyr::survey_median are
#' different. Default from *survey/srvyr* is "school" methodology and does not exist in *stats*
#' package. The default for *stats* is "hf7". *survey/srvyr* methodology is prefered as these
#' packages are built for complex survey design.
#' @note Default from *survey/srvyr* is "math" methodology. In case of odds number, it will return
#' the lower value. Default for stats::median will calculate the mean between the two points.
#' If there is a set of c(1,2), median(1,2) will return 1.5; survey_mean() will return 1 by default.
#' create_analysis_median has the "school" methodology set as default, the results will match the
#' default results from stats::median, pandas.median If want to calculate with the "math"
#' methodology, you should run your own analysis with survey_median.
#'
#' @return a data frame with the median for each group
#' @export
Expand Down Expand Up @@ -42,57 +43,63 @@ create_analysis_median <- function(design, group_var = NA, analysis_var, level =
}

# calculate
pre_design <- design %>%
dplyr::group_by(dplyr::across(dplyr::any_of(across_by)))

## error handling
## survey_median has an error with only NA it passes somewhere if(NA)
## To handle this problem, the missing_value_catch will try to run summarise around
## survey_quantile, if it does work, it will run survey_mean to get the NA/NaN.
## The error also happens with svyby and svyquantile and cannot be swapped
# as 26.11.2024
# currently srvyr::survey_median return error with only NAs in a group
# Caused by error in `h()`:
# ! error in evaluating the argument 'x' in selecting a method for function 't':
# missing value where TRUE/FALSE needed
#
# fix:
# - calculates median with filter(.preserve = FALSE) (default). it removes groups with all NA
# - calculates the counts and weigthed counts with .preserve = TRUE to have all groups
# - left_join on the counts to have all groups.
#
# - when all are missing return a dataframe with stat,stat_upp,stat_low as NaN as
# filter(.preserve = FALSE will break)

missing_value_catch <- function(expr) {
tryCatch(
error = function(cnd) {
design %>%
dplyr::group_by(dplyr::across(dplyr::any_of(across_by))) %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = T) %>%
srvyr::summarise(
stat = srvyr::survey_mean(
!!rlang::sym(analysis_var),
vartype = "ci",
level = as.numeric(level),
na.rm = T
),
n = dplyr::n(),
n_w = srvyr::survey_total(
vartype = "ci",
level = as.numeric(level),
na.rm = T
)
)
},
expr
)
}

results <- missing_value_catch(
design %>%
dplyr::group_by(dplyr::across(dplyr::any_of(across_by))) %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = T) %>%
# fix for 26.11.2024 bug
if (all(is.na(design[["variables"]][[analysis_var]]))) {
# edge case when all NA
results_median <- pre_design %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var))) %>%
dplyr::summarise(
stat = NaN,
stat_upp = NaN,
stat_low = NaN
)
} else {
results_median <- pre_design %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = FALSE) %>%
srvyr::summarise(
stat = srvyr::survey_median(
!!rlang::sym(analysis_var),
vartype = "ci",
level = as.numeric(level),
na.rm = T
),
n = dplyr::n(),
n_w = srvyr::survey_total(
vartype = "ci",
qrule = "school",
level = as.numeric(level),
na.rm = T
)
)
)
}

results_totals <- pre_design %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = TRUE) %>%
srvyr::summarise(
n = dplyr::n(),
n_w = srvyr::survey_total(
na.rm = T
)
)

if (is.null(across_by)) {
results <- cbind(results_median, results_totals)
} else {
results <- results_totals %>%
dplyr::left_join(results_median) %>%
suppressMessages()
}

results <- results %>%
dplyr::mutate(
Expand Down
11 changes: 6 additions & 5 deletions man/create_analysis_median.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 30 additions & 4 deletions tests/testthat/test-create_analysis_median.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ test_that("create_analysis_median returns correct output, no weights", {
)

svyquantile_results <- survey::svydesign(id = ~1, data = somedata) %>%
survey::svyquantile(~value, design = ., quantiles = c(.5)) %>%
survey::svyquantile(~value, design = ., quantiles = c(.5), qrule = "school") %>%
suppressWarnings()

expected_output <- svyquantile_results[["value"]] %>%
Expand Down Expand Up @@ -295,7 +295,9 @@ test_that("create_analysis_median handles lonely PSU", {
~groups,
design = .,
FUN = survey::svyquantile,
quantiles = .5, vartype = "ci"
quantiles = .5,
vartype = "ci",
qrule = "school"
) %>%
suppressWarnings()

Expand Down Expand Up @@ -411,7 +413,9 @@ test_that("create_analysis_median returns correct output with 3 grouping variabl
~ group_a + group_b + group_c,
design = .,
FUN = survey::svyquantile,
quantiles = .5, vartype = "ci"
quantiles = .5,
vartype = "ci",
qrule = "school"
) %>%
suppressWarnings()

Expand Down Expand Up @@ -506,7 +510,9 @@ test_that("create_analysis_median returns correct output with 2 grouping variabl
~ group_a + group_b,
design = .,
FUN = survey::svyquantile,
quantiles = .5, vartype = "ci"
quantiles = .5,
vartype = "ci",
qrule = "school"
) %>%
suppressWarnings()

Expand Down Expand Up @@ -590,3 +596,23 @@ test_that("stat is set to NaN when there is no value", {

expect_equal(results, expected_output, ignore_attr = T)
})

test_that("When only missing values, the correct values are return", {
set.seed(3452)
repex_df <- data.frame(group = c(rep("a", 5), rep("b",3)),
value = c(runif(5), rep(NA, 3)))
expected_results <- repex_df |>
dplyr::group_by(group) |>
dplyr::summarise(stat = median(value, na.rm = T)) |>
dplyr::mutate(stat = dplyr::if_else(is.na(stat), NaN, stat))

data_survey_design <- srvyr::as_survey(repex_df)

actual_results <- create_analysis_median(data_survey_design,
group_var = "group",
analysis_var = "value")

expect_equal(actual_results$stat, expected_results$stat)

})

Loading