Skip to content

Commit

Permalink
data_tabulate() gains by argument for crosstables (#481)
Browse files Browse the repository at this point in the history
* `data_tabluate()` gains `by` argument for crosstables

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* version

* fix

* update tests

* fixes update tests

* docs, add print_html methods

* update news

* code structure

* fixes

* add tests

* print markdown method

* add tests for markdown print

* lintr

* align values in tables

* tests for HTML

* add test

* use same column as rowname

* Update NEWS.md

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>

* address comments

* cell -> full

* update snapshots

* one more test

---------

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
  • Loading branch information
strengejacke and etiennebacher authored Feb 13, 2024
1 parent 3358b3e commit be6e2bf
Show file tree
Hide file tree
Showing 10 changed files with 1,024 additions and 42 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.9.1.3
Version: 0.9.1.4
Authors@R: c(
person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ S3method(describe_distribution,numeric)
S3method(format,data_codebook)
S3method(format,dw_data_peek)
S3method(format,dw_data_tabulate)
S3method(format,dw_data_xtabulate)
S3method(format,dw_groupmeans)
S3method(format,parameters_distribution)
S3method(kurtosis,data.frame)
Expand All @@ -91,6 +92,8 @@ S3method(print,data_seek)
S3method(print,dw_data_peek)
S3method(print,dw_data_tabulate)
S3method(print,dw_data_tabulates)
S3method(print,dw_data_xtabulate)
S3method(print,dw_data_xtabulates)
S3method(print,dw_groupmeans)
S3method(print,dw_groupmeans_list)
S3method(print,dw_transformer)
Expand All @@ -102,10 +105,13 @@ S3method(print_html,data_codebook)
S3method(print_html,dw_data_peek)
S3method(print_html,dw_data_tabulate)
S3method(print_html,dw_data_tabulates)
S3method(print_html,dw_data_xtabulate)
S3method(print_html,dw_data_xtabulates)
S3method(print_md,data_codebook)
S3method(print_md,dw_data_peek)
S3method(print_md,dw_data_tabulate)
S3method(print_md,dw_data_tabulates)
S3method(print_md,dw_data_xtabulate)
S3method(ranktransform,data.frame)
S3method(ranktransform,factor)
S3method(ranktransform,grouped_df)
Expand Down
5 changes: 4 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ CHANGES
* `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify
variables at specific positions or based on logical conditions.

* `data_tabulate()` gets a `weights` argument, to compute weighted frequency tables.
* `data_tabulate()` was revised and gets several new arguments: a `weights`
argument, to compute weighted frequency tables. `include_na` allows to include
or omit missing values from the table. Furthermore, a `by` argument was added,
to compute crosstables (#479, #481).

# datawizard 0.9.1

Expand Down
6 changes: 3 additions & 3 deletions R/data_arrange.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,16 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) {
dont_exist <- select[which(!select %in% names(data))]

if (length(dont_exist) > 0) {
if (!safe) {
insight::format_error(
if (safe) {
insight::format_warning(
paste0(
"The following column(s) don't exist in the dataset: ",
text_concatenate(dont_exist), "."
),
.misspelled_string(names(data), dont_exist, "Possibly misspelled?")
)
} else {
insight::format_warning(
insight::format_error(
paste0(
"The following column(s) don't exist in the dataset: ",
text_concatenate(dont_exist), "."
Expand Down
160 changes: 136 additions & 24 deletions R/data_tabulate.R
Original file line number Diff line number Diff line change
@@ -1,32 +1,47 @@
#' @title Create frequency tables of variables
#' @title Create frequency and crosstables of variables
#' @name data_tabulate
#'
#' @description This function creates frequency tables of variables, including
#' the number of levels/values as well as the distribution of raw, valid and
#' cumulative percentages.
#' @description This function creates frequency or crosstables of variables,
#' including the number of levels/values as well as the distribution of raw,
#' valid and cumulative percentages. For crosstables, row, column and cell
#' percentages can be calculated.
#'
#' @param x A (grouped) data frame, a vector or factor.
#' @param drop_levels Logical, if `TRUE`, factor levels that do not occur in
#' @param by Optional vector or factor. If supplied, a crosstable is created.
#' If `x` is a data frame, `by` can also be a character string indicating the
#' name of a variable in `x`.
#' @param drop_levels Logical, if `FALSE`, factor levels that do not occur in
#' the data are included in the table (with frequency of zero), else unused
#' factor levels are dropped from the frequency table.
#' @param name Optional character string, which includes the name that is used
#' for printing.
#' @param include_na Logical, if `TRUE`, missing values are included in the
#' frequency or crosstable, else missing values are omitted.
#' @param collapse Logical, if `TRUE` collapses multiple tables into one larger
#' table for printing. This affects only printing, not the returned object.
#' @param weights Optional numeric vector of weights. Must be of the same length
#' as `x`. If `weights` is supplied, weighted frequencies are calculated.
#' @param proportions Optional character string, indicating the type of
#' percentages to be calculated. Only applies to crosstables, i.e. when `by` is
#' not `NULL`. Can be `"row"` (row percentages), `"column"` (column percentages)
#' or `"full"` (to calculate relative frequencies for the full table).
#' @param ... not used.
#' @inheritParams find_columns
#'
#' @return A data frame, or a list of data frames, with one frequency table
#' as data frame per variable.
#'
#' @examplesIf requireNamespace("poorman")
#' # frequency tables -------
#' # ------------------------
#' data(efc)
#'
#' # vector/factor
#' data_tabulate(efc$c172code)
#'
#' # drop missing values
#' data_tabulate(efc$c172code, include_na = FALSE)
#'
#' # data frame
#' data_tabulate(efc, c("e42dep", "c172code"))
#'
Expand Down Expand Up @@ -54,6 +69,30 @@
#' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
#' data_tabulate(efc$e42dep, weights = efc$weights)
#'
#' # crosstables ------
#' # ------------------
#'
#' # add some missing values
#' set.seed(123)
#' efc$e16sex[sample.int(nrow(efc), 5)] <- NA
#'
#' data_tabulate(efc, "c172code", by = "e16sex")
#'
#' # add row and column percentages
#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "row")
#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
#'
#' # omit missing values
#' data_tabulate(
#' efc$c172code,
#' by = efc$e16sex,
#' proportions = "column",
#' include_na = FALSE
#' )
#'
#' # round percentages
#' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
#' print(out, digits = 0)
#' @export
data_tabulate <- function(x, ...) {
UseMethod("data_tabulate")
Expand All @@ -62,7 +101,15 @@ data_tabulate <- function(x, ...) {

#' @rdname data_tabulate
#' @export
data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = NULL, verbose = TRUE, ...) {
data_tabulate.default <- function(x,
by = NULL,
drop_levels = FALSE,
weights = NULL,
include_na = TRUE,
proportions = NULL,
name = NULL,
verbose = TRUE,
...) {
# save label attribute, before it gets lost...
var_label <- attr(x, "label", exact = TRUE)

Expand All @@ -78,25 +125,52 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name =
x <- droplevels(x)
}

# check for correct length of weights - must be equal to "x"
if (!is.null(weights) && length(weights) != length(x)) {
insight::format_error("Length of `weights` must be equal to length of `x`.")
# validate "weights"
weights <- .validate_table_weights(weights, x)

# we go into another function for crosstables here...
if (!is.null(by)) {
by <- .validate_by(by, x)
return(.crosstable(
x,
by = by,
weights = weights,
include_na = include_na,
proportions = proportions,
obj_name = obj_name,
group_variable = group_variable
))
}

# frequency table
if (is.null(weights)) {
freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
} else {
# weighted frequency table
if (include_na) {
freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
} else {
freq_table <- tryCatch(table(x), error = function(e) NULL)
}
} else if (include_na) {
# weighted frequency table, including NA
freq_table <- tryCatch(
stats::xtabs(
weights ~ x,
data = data.frame(weights = weights, x = x),
data = data.frame(weights = weights, x = addNA(x)),
na.action = stats::na.pass,
addNA = TRUE
),
error = function(e) NULL
)
} else {
# weighted frequency table, excluding NA
freq_table <- tryCatch(
stats::xtabs(
weights ~ x,
data = data.frame(weights = weights, x = x),
na.action = stats::na.omit,
addNA = FALSE
),
error = function(e) NULL
)
}

if (is.null(freq_table)) {
Expand All @@ -115,7 +189,14 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name =
}

out$`Raw %` <- 100 * out$N / sum(out$N)
out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
# if we have missing values, we add a row with NA
if (include_na) {
out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE)
} else {
out$`Valid %` <- 100 * out$N / sum(out$N)
valid_n <- sum(out$N, na.rm = TRUE)
}
out$`Cumulative %` <- cumsum(out$`Valid %`)

# add information about variable/group names
Expand Down Expand Up @@ -144,7 +225,7 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name =
attr(out, "weights") <- weights

attr(out, "total_n") <- sum(out$N, na.rm = TRUE)
attr(out, "valid_n") <- sum(out$N[-length(out$N)], na.rm = TRUE)
attr(out, "valid_n") <- valid_n

class(out) <- c("dw_data_tabulate", "data.frame")

Expand All @@ -159,9 +240,12 @@ data_tabulate.data.frame <- function(x,
exclude = NULL,
ignore_case = FALSE,
regex = FALSE,
collapse = FALSE,
by = NULL,
drop_levels = FALSE,
weights = NULL,
include_na = TRUE,
proportions = NULL,
collapse = FALSE,
verbose = TRUE,
...) {
# evaluate arguments
Expand All @@ -172,11 +256,31 @@ data_tabulate.data.frame <- function(x,
regex = regex,
verbose = verbose
)

# validate "by"
by <- .validate_by(by, x)
# validate "weights"
weights <- .validate_table_weights(weights, x)

out <- lapply(select, function(i) {
data_tabulate(x[[i]], drop_levels = drop_levels, weights = weights, name = i, verbose = verbose, ...)
data_tabulate(
x[[i]],
by = by,
proportions = proportions,
drop_levels = drop_levels,
weights = weights,
include_na = include_na,
name = i,
verbose = verbose,
...
)
})

class(out) <- c("dw_data_tabulates", "list")
if (is.null(by)) {
class(out) <- c("dw_data_tabulates", "list")
} else {
class(out) <- c("dw_data_xtabulates", "list")
}
attr(out, "collapse") <- isTRUE(collapse)
attr(out, "is_weighted") <- !is.null(weights)

Expand All @@ -190,10 +294,13 @@ data_tabulate.grouped_df <- function(x,
exclude = NULL,
ignore_case = FALSE,
regex = FALSE,
verbose = TRUE,
collapse = FALSE,
by = NULL,
proportions = NULL,
drop_levels = FALSE,
weights = NULL,
include_na = TRUE,
collapse = FALSE,
verbose = TRUE,
...) {
# works only for dplyr >= 0.8.0
grps <- attr(x, "groups", exact = TRUE)
Expand All @@ -210,6 +317,7 @@ data_tabulate.grouped_df <- function(x,
)

x <- as.data.frame(x)

out <- list()
for (i in seq_along(grps)) {
rows <- grps[[i]]
Expand All @@ -227,20 +335,25 @@ data_tabulate.grouped_df <- function(x,
verbose = verbose,
drop_levels = drop_levels,
weights = weights,
include_na = include_na,
by = by,
proportions = proportions,
group_variable = group_variable,
...
))
}
class(out) <- c("dw_data_tabulates", "list")
if (is.null(by)) {
class(out) <- c("dw_data_tabulates", "list")
} else {
class(out) <- c("dw_data_xtabulates", "list")
}
attr(out, "collapse") <- isTRUE(collapse)
attr(out, "is_weighted") <- !is.null(weights)

out
}




# methods --------------------

#' @importFrom insight print_html
Expand Down Expand Up @@ -286,7 +399,6 @@ format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) {
}



#' @export
print.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
a <- attributes(x)
Expand Down
Loading

0 comments on commit be6e2bf

Please sign in to comment.