From a343d6e6606c2220041620082c56a5bf33a3334c Mon Sep 17 00:00:00 2001 From: Richard Hanna Date: Tue, 24 Sep 2024 14:46:02 -0400 Subject: [PATCH] Update documentation, test methods for mixed joins --- NAMESPACE | 1 + R/join_data_tibbles.R | 148 +++++++++++++++++++++++++++++---------- man/build_by.Rd | 24 +++++++ man/extract_keys.Rd | 21 ++++++ man/get_join_fn.Rd | 19 +++++ man/get_structure.Rd | 17 +++++ man/get_type.Rd | 17 +++++ man/join_data_tibbles.Rd | 69 ++++++++++++++++++ 8 files changed, 279 insertions(+), 37 deletions(-) create mode 100644 man/build_by.Rd create mode 100644 man/extract_keys.Rd create mode 100644 man/get_join_fn.Rd create mode 100644 man/get_structure.Rd create mode 100644 man/get_type.Rd create mode 100644 man/join_data_tibbles.Rd diff --git a/NAMESPACE b/NAMESPACE index b74e60ed..e584bcaf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(fmt_strip_html) export(fmt_strip_trailing_colon) export(fmt_strip_trailing_punct) export(fmt_strip_whitespace) +export(join_data_tibbles) export(make_labelled) export(read_redcap) export(write_redcap_xlsx) diff --git a/R/join_data_tibbles.R b/R/join_data_tibbles.R index 591e4465..57a9871d 100644 --- a/R/join_data_tibbles.R +++ b/R/join_data_tibbles.R @@ -1,65 +1,107 @@ -join_data_tibbles <- function(suprtbl, +#' @title Join Two Data Tibbles from a Supertibble +#' +#' @description +#' The [join_data_tibbles()] function provides a way to intelligently join two +#' data tibbles from a REDCaTidieR supertibble. A supertibble is an output of +#' [read_redcap()]. +#' +#' [join_data_tibbles()] attempts to correctly assign the `by` when left `NULL` (the default) +#' based on detecting the data tibble structure of `x` and `y`. +#' +#' @inheritParams extract_tibbles +#' @param type A character string for the type of join to be performed borrowing from +#' dplyr. One of "left", "right", "inner", or "full". Default "left". +#' @inheritParams dplyr::inner_join +#' +#' +#' @returns A `tibble`. +#' +#' @export + +join_data_tibbles <- function(supertbl, x, y, by = NULL, type = "left", suffix = c(".x", ".y")) { - record_id_field <- get_record_id_field(suprtbl$redcap_data[[1]]) # nolint: object_usage_linter + record_id_field <- get_record_id_field(supertbl$redcap_data[[1]]) # nolint: object_usage_linter join_fn <- get_join_fn(type) # Append the supertibble with the primary keys column - suprtbl <- suprtbl |> + supertbl <- supertbl |> mutate(pks = purrr::map_chr(.data$redcap_data, ~ extract_keys(., record_id_field = record_id_field))) %>% select(.data$redcap_form_name, .data$redcap_form_label, .data$redcap_data, - .data$redcap_metadata, .data$structure, .data$pks, .data$redcap_events) + .data$redcap_metadata, .data$structure, .data$pks, matches("redcap_events")) - tbl_x <- extract_tibble(suprtbl, x) - tbl_x_structure <- get_structure(suprtbl, x) - tbl_y <- extract_tibble(suprtbl, y) - tbl_y_structure <- get_structure(suprtbl, y) + tbl_x <- extract_tibble(supertbl, x) + tbl_x_structure <- get_structure(supertbl, x) + tbl_y <- extract_tibble(supertbl, y) + tbl_y_structure <- get_structure(supertbl, y) # Mixed structure requires special handling is_mixed <- any(c(tbl_x_structure, tbl_y_structure) == "mixed") if (is_mixed) { # TODO: Determine if ok to remove - # required_columns <- c("redcap_event_instance", "redcap_form_instance") # nolint: commented_code_linter - # tbl_x <- add_missing_columns(tbl_x, required_columns) # nolint: commented_code_linter - # tbl_y <- add_missing_columns(tbl_y, required_columns) # nolint: commented_code_linter + required_columns <- c("redcap_event_instance", "redcap_form_instance") # nolint: commented_code_linter + tbl_x <- add_missing_columns(tbl_x, required_columns) # nolint: commented_code_linter + tbl_y <- add_missing_columns(tbl_y, required_columns) # nolint: commented_code_linter - tbl_x_type <- get_type(suprtbl, x) - tbl_y_type <- get_type(suprtbl, y) + tbl_x_type <- get_type(supertbl, x) + tbl_y_type <- get_type(supertbl, y) + # Add on .repeat_type specifier for the redcap_event column tbl_x <- left_join(tbl_x, tbl_x_type, by = "redcap_event") tbl_y <- left_join(tbl_y, tbl_y_type, by = "redcap_event") } join_fn <- get_join_fn(type) - by <- build_by(suprtbl, x, y, is_mixed) + by <- build_by(supertbl, x, y, is_mixed) - join_tbls(tbl_x, tbl_y, join_fn, by, suffix, is_mixed) %>% + join_fn(tbl_x, tbl_y, by, suffix) %>% relocate(starts_with("form_status_complete"), .after = everything()) %>% select(-starts_with(".repeat_type")) } -extract_keys <- function(suprtbl, record_id_field) { +#' @title Extract the primary keys associated with a data tibble +#' +#' @param data_tbl A data tibble from a supertibble +#' @param record_id_field The record ID field for the REDCap project, retrieved +#' as an ouput of [get_record_id_field()] +#' +#' @returns a character string +#' +#' @keywords internal +extract_keys <- function(data_tbl, record_id_field) { redcap_keys <- c( record_id_field, "redcap_event", "redcap_form_instance", "redcap_event_instance", "redcap_arm" ) - suprtbl |> + data_tbl |> colnames() |> intersect(redcap_keys) |> paste(collapse = ", ") } -get_structure <- function(suprtbl, tbl_name) { - suprtbl$structure[suprtbl$redcap_form_name == tbl_name] +#' @title Retrieve the structure data for a form from the supertibble +#' +#' @inheritParams join_data_tibbles +#' @param tbl_name the `x` or `y` values assigned to `join_data_tibbles` +#' +#' @keywords internal +get_structure <- function(supertbl, tbl_name) { + supertbl$structure[supertbl$redcap_form_name == tbl_name] } -get_type <- function(suprtbl, tbl_name) { - suprtbl %>% +#' @title Retrieve the repeat event type data for a form from the supertibble +#' +#' @inheritParams join_data_tibbles +#' @param tbl_name the `x` or `y` values assigned to `join_data_tibbles` +#' +#' @keywords internal +get_type <- function(supertbl, tbl_name) { + supertbl %>% filter(.data$redcap_form_name == tbl_name) %>% pull(.data$redcap_events) %>% pluck(1) %>% @@ -68,6 +110,13 @@ get_type <- function(suprtbl, tbl_name) { unique() } +#' @title Retrieve the appropriate user specified join function +#' +#' @inheritParams join_data_tibbles +#' +#' @returns a function +#' +#' @keywords internal get_join_fn <- function(type) { join_functions <- list( left = dplyr::left_join, @@ -76,24 +125,32 @@ get_join_fn <- function(type) { full = dplyr::full_join ) - # Check if the specified type is valid - # TODO: Make a standard check function with cli if (!type %in% names(join_functions)) { - stop("Invalid join type. Choose from 'left', 'right', 'inner', or 'full'.") + cli::cli_abort("Invalid join type. Choose from 'left', 'right', 'inner', or 'full'.") } join_functions[[type]] } -build_by <- function(suprtbl, x, y, is_mixed) { - x_pks <- suprtbl$pks[suprtbl$redcap_form_name == x] %>% +#' @title Intelligently retrieve the join by cols +#' +#' @inheritParams join_data_tibbles +#' @param is_mixed TRUE/FALSE, whether or not the given tables contain a mixed structure +#' +#' @returns a character vector +#' +#' @keywords interal +build_by <- function(supertbl, x, y, is_mixed) { + x_pks <- supertbl$pks[supertbl$redcap_form_name == x] %>% stringr::str_split(", ", simplify = TRUE) - y_pks <- suprtbl$pks[suprtbl$redcap_form_name == y] %>% + y_pks <- supertbl$pks[supertbl$redcap_form_name == y] %>% stringr::str_split(", ", simplify = TRUE) out <- intersect(x_pks, y_pks) if (is_mixed) { + # For mixed tables, depending on the .repeat_types present tables may not + # have event and form instance columns and must be added out <- c(out, "redcap_event_instance", "redcap_form_instance") %>% # TODO: Make standard, currently needed for repeat/mixed joins unique() @@ -102,37 +159,54 @@ build_by <- function(suprtbl, x, y, is_mixed) { out } -# TODO: Determine if ok to remove +#' @keywords intenral +#' @noRd add_missing_columns <- function(tbl, columns) { missing_cols <- setdiff(columns, names(tbl)) tbl[missing_cols] <- NA return(tbl) } +#' @title Join data tbls of various structures and types +#' +#' @description +#' [join_tbls()] either performs the `join_fun()` specified by the `type` or, in +#' the event of mixed structure data tibble joins, will seek to split data into +#' three categories before performing the joins. The key identifiers here are +#' `redcap_form_instance` and the added `.repeat_type` columns. + join_tbls <- function(x, y, join_fn, by, suffix, is_mixed) { if (is_mixed) { # Filter based on .repeat_type + # If repeating together events, can use redcap_form_instance (NA) and redcap_event_instance x_together <- x %>% filter(.data$.repeat_type == "repeat_together") y_together <- y %>% filter(.data$.repeat_type == "repeat_together") - x_separate <- x %>% filter(.data$.repeat_type == "repeat_separate") - y_separate <- y %>% filter(.data$.repeat_type == "repeat_separate") + # repeating instruments for separately repeating events shouldn't be joined by redcap_form_instance + x_separate_repeating <- x %>% filter(.data$.repeat_type == "repeat_separate" & !is.na(.data$redcap_form_instance)) + y_separate_repeating <- y %>% filter(.data$.repeat_type == "repeat_separate" & !is.na(.data$redcap_form_instance)) + + # nonrepeating instruments for separately repeating events should be joined by redcap_form_instance + x_separate_nonrepeating <- x %>% filter(.data$.repeat_type == "repeat_separate" & is.na(.data$redcap_form_instance)) + y_separate_nonrepeating <- y %>% filter(.data$.repeat_type == "repeat_separate" & is.na(.data$redcap_form_instance)) # Join together sets joined_together <- x_together %>% - join_fn(y_together, by = by[by != "redcap_form_instance"], suffix = suffix) + join_fn(y_together, by = by, suffix = suffix) - # Join separate sets - joined_separate <- x_separate %>% - join_fn(y_separate, by = by[by != "redcap_form_instance"], suffix = suffix) + joined_separate_repeating <- x_separate_repeating %>% + join_fn(y_separate_repeating, by = by[by != "redcap_form_instance"], suffix = suffix) - # Bind rows together - result <- bind_rows(joined_together, joined_separate) %>% + joined_separate_nonrepeating <- x_separate_nonrepeating %>% + join_fn(y_separate_nonrepeating, by = by, suffix = suffix) + + # Bind rows together, issue in arrangmenet of output + result <- bind_rows(joined_together, joined_separate_repeating) %>% + bind_rows(joined_separate_nonrepeating) %>% drop_non_suffix_columns() } else { result <- join_fn(x, y, by = by, suffix = suffix) } - result } diff --git a/man/build_by.Rd b/man/build_by.Rd new file mode 100644 index 00000000..e491c439 --- /dev/null +++ b/man/build_by.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/join_data_tibbles.R +\name{build_by} +\alias{build_by} +\title{Intelligently retrieve the join by cols} +\usage{ +build_by(supertbl, x, y, is_mixed) +} +\arguments{ +\item{supertbl}{A supertibble generated by \code{read_redcap()}. Required.} + +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{is_mixed}{TRUE/FALSE, whether or not the given tables contain a mixed structure} +} +\value{ +a character vector +} +\description{ +Intelligently retrieve the join by cols +} +\keyword{interal} diff --git a/man/extract_keys.Rd b/man/extract_keys.Rd new file mode 100644 index 00000000..b497120b --- /dev/null +++ b/man/extract_keys.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/join_data_tibbles.R +\name{extract_keys} +\alias{extract_keys} +\title{Extract the primary keys associated with a data tibble} +\usage{ +extract_keys(data_tbl, record_id_field) +} +\arguments{ +\item{data_tbl}{A data tibble from a supertibble} + +\item{record_id_field}{The record ID field for the REDCap project, retrieved +as an ouput of \code{\link[=get_record_id_field]{get_record_id_field()}}} +} +\value{ +a character string +} +\description{ +Extract the primary keys associated with a data tibble +} +\keyword{internal} diff --git a/man/get_join_fn.Rd b/man/get_join_fn.Rd new file mode 100644 index 00000000..f99d7b01 --- /dev/null +++ b/man/get_join_fn.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/join_data_tibbles.R +\name{get_join_fn} +\alias{get_join_fn} +\title{Retrieve the appropriate user specified join function} +\usage{ +get_join_fn(type) +} +\arguments{ +\item{type}{A character string for the type of join to be performed borrowing from +dplyr. One of "left", "right", "inner", or "full". Default "left".} +} +\value{ +a function +} +\description{ +Retrieve the appropriate user specified join function +} +\keyword{internal} diff --git a/man/get_structure.Rd b/man/get_structure.Rd new file mode 100644 index 00000000..b8a901eb --- /dev/null +++ b/man/get_structure.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/join_data_tibbles.R +\name{get_structure} +\alias{get_structure} +\title{Retrieve the structure data for a form from the supertibble} +\usage{ +get_structure(supertbl, tbl_name) +} +\arguments{ +\item{supertbl}{A supertibble generated by \code{read_redcap()}. Required.} + +\item{tbl_name}{the \code{x} or \code{y} values assigned to \code{join_data_tibbles}} +} +\description{ +Retrieve the structure data for a form from the supertibble +} +\keyword{internal} diff --git a/man/get_type.Rd b/man/get_type.Rd new file mode 100644 index 00000000..f3088095 --- /dev/null +++ b/man/get_type.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/join_data_tibbles.R +\name{get_type} +\alias{get_type} +\title{Retrieve the repeat event type data for a form from the supertibble} +\usage{ +get_type(supertbl, tbl_name) +} +\arguments{ +\item{supertbl}{A supertibble generated by \code{read_redcap()}. Required.} + +\item{tbl_name}{the \code{x} or \code{y} values assigned to \code{join_data_tibbles}} +} +\description{ +Retrieve the repeat event type data for a form from the supertibble +} +\keyword{internal} diff --git a/man/join_data_tibbles.Rd b/man/join_data_tibbles.Rd new file mode 100644 index 00000000..75dc94b1 --- /dev/null +++ b/man/join_data_tibbles.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/join_data_tibbles.R +\name{join_data_tibbles} +\alias{join_data_tibbles} +\title{Join Two Data Tibbles from a Supertibble} +\usage{ +join_data_tibbles( + supertbl, + x, + y, + by = NULL, + type = "left", + suffix = c(".x", ".y") +) +} +\arguments{ +\item{supertbl}{A supertibble generated by \code{read_redcap()}. Required.} + +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. + +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. + +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. + +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{type}{A character string for the type of join to be performed borrowing from +dplyr. One of "left", "right", "inner", or "full". Default "left".} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. +Should be a character vector of length 2.} +} +\value{ +A \code{tibble}. +} +\description{ +The \code{\link[=join_data_tibbles]{join_data_tibbles()}} function provides a way to intelligently join two +data tibbles from a REDCaTidieR supertibble. A supertibble is an output of +\code{\link[=read_redcap]{read_redcap()}}. + +\code{\link[=join_data_tibbles]{join_data_tibbles()}} attempts to correctly assign the \code{by} when left \code{NULL} (the default) +based on detecting the data tibble structure of \code{x} and \code{y}. +}