Skip to content

Commit

Permalink
Merge pull request #141 from metrumresearchgroup/factors
Browse files Browse the repository at this point in the history
ys_factors
  • Loading branch information
kylebaron authored Jul 25, 2023
2 parents 4dd75c3 + 533c5aa commit 8807324
Show file tree
Hide file tree
Showing 8 changed files with 267 additions and 2 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ Imports:
stringr
Suggests: pander, testthat
VignetteBuilder: knitr
RoxygenNote: 7.1.2
RoxygenNote: 7.2.3
Collate:
'utils.R'
'Aaaa.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ export(ys_col_note)
export(ys_document)
export(ys_dont_sanitize)
export(ys_extend)
export(ys_factors)
export(ys_fill_dots)
export(ys_filter)
export(ys_get_label)
Expand Down
115 changes: 114 additions & 1 deletion R/col_factor.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
#'
#' head(ys_add_factors(data, spec))
#'
#' @seealso [ys_factors()]
#'
#' @md
#' @export
ys_add_factors <- function(.data, .spec, ... ,
Expand All @@ -48,7 +50,7 @@ ys_add_factors <- function(.data, .spec, ... ,
if(length(what) > 0) {
what <- names(eval_select(expr(c(...)), .data))
}

if(length(what)==0 & isTRUE(.all)) {
dis <- map_lgl(.spec, ~!is.null(.x[["values"]]))
spec_cols <- names(which(dis | fct_ok))
Expand Down Expand Up @@ -106,3 +108,114 @@ ys_make_factor <- function(values, x, strict = TRUE, .missing = NULL) {
#' @rdname ys_add_factors
#' @export
yspec_make_factor <- ys_make_factor


#' Convert columns to factors
#'
#' This function works like [ys_add_factors()] with the difference that
#' the original columns become factors (retaining the original column names)
#' and the original columns are retained with a suffix. You can think of this
#' as a more convenient form of `ys_add_factors(..., .suffix = "")`.
#'
#' @inheritParams ys_add_factors
#' @param .keep_values logical; if `TRUE`, value columns will be retained with
#' a `.suffix`.
#' @param .suffix a suffix to be added to original columns (holding values).
#'
#' @return
#' The original data frame is returned with columns converted to factors
#' and (possibly) additional columns storing values.
#'
#' @details
#' Factor conversion will only take place on source columns that _aren't_
#' already factors. That is, if a column in `data` is already a factor, it
#' will be ignored. This means the function can be called multiple times on
#' the same input data, but once a column is converted to factor, it will
#' cannot be converted again in subsequent calls.
#'
#'
#' @examples
#'
#' library(dplyr)
#'
#' spec <- ys_help$spec()
#' data <- ys_help$data()
#'
#' data <- ys_factors(data, spec)
#'
#' head(data, 5)
#'
#' spec$EVID
#'
#' count(data, EVID, EVID_v)
#'
#' @seealso [ys_add_factors()]
#' @md
#' @export
ys_factors <- function(data, spec, ...,
.keep_values = TRUE,
.suffix = "_v") {

assert_that(is.data.frame(data))
assert_that(is_yspec(spec))

if(isTRUE(.keep_values) && identical(.suffix, "")) .suffix <- NULL
if(is.null(.suffix)) .keep_values <- FALSE

incoming_names <- names(data)

tag <- "__ys@factors__"

# Don't modify anything that is already a factor
factors <- which(sapply(data, is.factor))
already_factors <- names(data)[factors]

data <- ys_add_factors(data, spec, ..., .suffix = tag)

# Column indices that contain new factors
fct_cols <- which(grepl(tag, names(data), fixed = TRUE))
if(length(fct_cols)==0) return(data)
# Mangled names of columns that contain new factors
fct_names <- names(data)[fct_cols]

# Original names of columns to be converted
col_names <- sub(tag, "", names(data)[fct_cols], fixed = TRUE)

# If an incoming column is factor, we ignore; need to adjust both
# col_names and fct_cols
if(length(already_factors) > 0) {
drop <- col_names %in% already_factors
col_names <- col_names[!drop]
fct_cols <- fct_cols[!drop]
}

# Indices of original columns
col_cols <- match(col_names, names(data))

# Nothing left to work on; everything was already a factor
if(length(col_names)==0) {
data <- data[, !(grepl(tag, names(data), fixed = TRUE))]
return(data)
}

if(isTRUE(.keep_values)) {
names(data)[col_cols] <- paste0(col_names, .suffix)
}

# Set names back to original
names(data)[fct_cols] <- col_names

if(!isTRUE(.keep_values)) {
data[,col_cols] <- NULL
}

# Drop any tagged columns
data <- data[, !(grepl(tag, names(data), fixed = TRUE))]

# Restore column order
new_names <- names(data)
select_names <- unique(c(incoming_names, new_names))
data <- data[, select_names]

return(data)
}
Binary file modified inst/test_data/test1.RDS
Binary file not shown.
Binary file modified inst/test_data/test2.RDS
Binary file not shown.
3 changes: 3 additions & 0 deletions man/ys_add_factors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 57 additions & 0 deletions man/ys_factors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

91 changes: 91 additions & 0 deletions tests/testthat/test-col_factor.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,94 @@ test_that("tidyselect semantics when identifying columns for factor [YSP-TEST-00
expect_true("PHASE_f" %in% names(ans))
expect_true("STUDY_f" %in% names(ans))
})

test_that("ys_factors converts originals to factors", {
data <- ys_help$data()
spec <- ys_help$spec()

expect_true(is.numeric(data$EVID))
expect_false("EVID_v" %in% names(data))

data <- ys_factors(data, spec, EVID)
expect_is(data$EVID, "factor")
expect_equal(levels(data$EVID), spec$EVID$decode)

expect_true("EVID_v" %in% names(data))

expect_error(ys_factors(data, spec, KYLE),
"Column `KYLE` doesn't exist.")
})

test_that("ys_factors retains original values by default", {
data0 <- ys_help$data()
spec <- ys_help$spec()

expect_false("EVID_v" %in% names(data0))

data <- ys_factors(data0, spec, EVID)

expect_true("EVID_v" %in% names(data))
expect_equal(unique(data$EVID_v), c(1,0))

data <- ys_factors(data0, spec, EVID, .suffix = "values")
expect_true("EVIDvalues" %in% names(data))
})

test_that("ys_factors discard original values", {
data <- ys_help$data()
spec <- ys_help$spec()

data1 <- ys_factors(data, spec, .keep_values = FALSE)
expect_identical(names(data), names(data1))

expect_true(is.numeric(data$EVID))
expect_true(is.factor(data1$EVID))

data2 <- ys_factors(data, spec, .suffix = NULL)
expect_identical(data1, data2)
})

test_that("ys_factors will convert everything", {
data <- ys_help$data()
spec <- ys_help$spec()

decode <- purrr::map(spec, "decode")
mkf <- purrr::map(spec, "make_factor")
decode <- purrr::compact(decode)
mkf <- purrr::compact(mkf)
ndecode <- length(decode) + length(mkf)

data <- ys_factors(data, spec)
values <- names(data)[grep("_v", names(data), fixed = TRUE)]
expect_equal(length(values), ndecode)

nw <- data[, sapply(data, is.factor)]
expect_equal(ncol(nw), ndecode)
})

test_that("call ys_factors more than once on a data frame", {
data <- ys_help$data()
spec <- ys_help$spec()

data2 <- ys_factors(data, spec)
data3 <- ys_factors(data2, spec)
expect_identical(data2, data3)

labs <- names(data)
data4 <- ys_factors(data, spec, EVID)
expect_identical(names(data4), c(labs, "EVID_v"))
data5 <- ys_factors(data4, spec, CP, EVID)
expect_identical(names(data5), c(labs, "EVID_v", "CP_v"))

data6 <- ys_factors(data5, spec)
data7 <- ys_factors(data6, spec)
expect_identical(data6, data7)
})

test_that("don't keep values when suffix is empty", {
data <- ys_help$data()
spec <- ys_help$spec()
data <- ys_factors(data, spec, .suffix = "")
expect_is(data$EVID, "factor")
expect_null(data$EVID_v)
})

0 comments on commit 8807324

Please sign in to comment.