From d6c90f0561a4e2efb252e0cb8843fd9596d1571b Mon Sep 17 00:00:00 2001
From: Ezra Porter <60618324+ezraporter@users.noreply.github.com>
Date: Thu, 14 Mar 2024 14:43:19 -0400
Subject: [PATCH] add haven option to labels

---
 DESCRIPTION                   |   4 +-
 NAMESPACE                     |  10 +++
 R/REDCapTidieR-package.R      |   6 +-
 R/read_redcap.R               |  15 ++--
 R/utils.R                     | 143 ++++++++++++++++++++++++++++------
 man/apply_labs_factor.Rd      |  22 ++++++
 man/apply_labs_haven.Rd       |  31 ++++++++
 man/invert_vec.Rd             |  18 +++++
 man/multi_choice_to_labels.Rd |   7 +-
 man/read_redcap.Rd            |   5 +-
 tests/testthat/test-utils.R   |  56 +++++++++++++
 11 files changed, 280 insertions(+), 37 deletions(-)
 create mode 100644 man/apply_labs_factor.Rd
 create mode 100644 man/apply_labs_haven.Rd
 create mode 100644 man/invert_vec.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index c637efa4..d8bb60e9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -33,7 +33,8 @@ Imports:
     tidyselect,
     formattable,
     pillar,
-    vctrs
+    vctrs,
+    readr
 Suggests: 
     covr,
     knitr,
@@ -41,7 +42,6 @@ Suggests:
     lintr,
     openxlsx2 (>= 0.8),
     prettyunits,
-    readr,
     rmarkdown,
     skimr,
     testthat (>= 3.0.0),
diff --git a/NAMESPACE b/NAMESPACE
index 37c2d663..bd72cbd5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -56,6 +56,8 @@ importFrom(dplyr,slice)
 importFrom(dplyr,summarise)
 importFrom(formattable,percent)
 importFrom(lobstr,obj_size)
+importFrom(lubridate,is.Date)
+importFrom(lubridate,is.POSIXt)
 importFrom(lubridate,is.difftime)
 importFrom(lubridate,is.period)
 importFrom(pillar,tbl_sum)
@@ -67,6 +69,13 @@ importFrom(purrr,map_lgl)
 importFrom(purrr,pluck)
 importFrom(purrr,pmap_chr)
 importFrom(purrr,some)
+importFrom(readr,parse_character)
+importFrom(readr,parse_date)
+importFrom(readr,parse_datetime)
+importFrom(readr,parse_double)
+importFrom(readr,parse_integer)
+importFrom(readr,parse_logical)
+importFrom(readr,parse_time)
 importFrom(rlang,"!!!")
 importFrom(rlang,.data)
 importFrom(rlang,abort)
@@ -117,4 +126,5 @@ importFrom(tidyselect,eval_select)
 importFrom(tidyselect,everything)
 importFrom(tidyselect,starts_with)
 importFrom(tidyselect,where)
+importFrom(vctrs,vec_ptype)
 importFrom(vctrs,vec_ptype_abbr)
diff --git a/R/REDCapTidieR-package.R b/R/REDCapTidieR-package.R
index d24682de..e4b3a88e 100644
--- a/R/REDCapTidieR-package.R
+++ b/R/REDCapTidieR-package.R
@@ -8,7 +8,7 @@
 #' left_join mutate pull recode relocate rename row_number select slice summarise
 #' @importFrom formattable percent
 #' @importFrom lobstr obj_size
-#' @importFrom lubridate is.difftime is.period
+#' @importFrom lubridate is.difftime is.period is.POSIXt is.Date
 #' @importFrom purrr compose map map2 map_int map_lgl pluck pmap_chr some
 #' @importFrom REDCapR redcap_arm_export redcap_event_instruments redcap_instruments
 #' redcap_metadata_read redcap_read_oneshot sanitize_token
@@ -23,8 +23,10 @@
 #' @importFrom tidyr complete fill pivot_wider nest unnest unnest_wider
 #' @importFrom tidyselect all_of any_of ends_with eval_select everything
 #' starts_with where
-#' @importFrom vctrs vec_ptype_abbr
+#' @importFrom vctrs vec_ptype_abbr vec_ptype
 #' @importFrom pillar tbl_sum
+#' @importFrom readr parse_logical parse_integer parse_double parse_date parse_time
+#' parse_datetime parse_character
 "_PACKAGE"
 
 ## usethis namespace: start
diff --git a/R/read_redcap.R b/R/read_redcap.R
index 93175fbb..4c1f627a 100644
--- a/R/read_redcap.R
+++ b/R/read_redcap.R
@@ -36,9 +36,10 @@
 #' "https://server.org/apps/redcap/api/"). Required.
 #' @param token The user-specific string that serves as the password for a
 #' project. Required.
-#' @param raw_or_label A string (either 'raw' or 'label') that specifies whether
+#' @param raw_or_label A string (either 'raw', 'label', or 'haven') that specifies whether
 #' to export the raw coded values or the labels for the options of categorical
-#' fields. Default is 'label'.
+#' fields. Default is 'label'. If 'haven' is supplied, categorical fields are converted
+#' to `haven_labelled` vectors.
 #' @param forms A character vector of REDCap instrument names that specifies
 #' which instruments to import. Default is `NULL` which imports all instruments
 #' in the project.
@@ -84,13 +85,17 @@ read_redcap <- function(redcap_uri,
   check_arg_is_character(redcap_uri, len = 1, any.missing = FALSE)
   check_arg_is_character(token, len = 1, any.missing = FALSE)
   check_arg_is_valid_token(token)
-  check_arg_choices(raw_or_label, choices = c("label", "raw"))
+  check_arg_choices(raw_or_label, choices = c("label", "raw", "haven"))
   check_arg_is_character(forms, min.len = 1, null.ok = TRUE, any.missing = FALSE)
   check_arg_is_logical(export_survey_fields, len = 1, any.missing = FALSE, null.ok = TRUE)
   check_arg_is_logical(export_data_access_groups, len = 1, any.missing = FALSE, null.ok = TRUE)
   check_arg_is_logical(suppress_redcapr_messages, len = 1, any.missing = FALSE)
   check_arg_is_logical(allow_mixed_structure, len = 1, any.missing = FALSE)
 
+  if (raw_or_label == "haven") {
+    check_installed("labelled", reason = "to use `read_redcap()` with `raw_or_label = 'haven'`")
+  }
+
   # Load REDCap Metadata ----
   # Capture unexpected metadata API call errors
   db_metadata <- try_redcapr({
@@ -251,8 +256,8 @@ read_redcap <- function(redcap_uri,
       filter(.data$field_name_updated %in% names(db_data))
   }
 
-  if (raw_or_label == "label") {
-    db_data <- multi_choice_to_labels(db_data, db_metadata)
+  if (raw_or_label != "raw") {
+    db_data <- multi_choice_to_labels(db_data, db_metadata, raw_or_label)
   }
 
   # Longitudinal Arms Check and Cleaning Application ----
diff --git a/R/utils.R b/R/utils.R
index c8c7cc23..792675da 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -397,10 +397,17 @@ update_data_col_names <- function(db_data, db_metadata) {
 #'
 #' @param db_data A REDCap database object
 #' @param db_metadata A REDCap metadata object
+#' @inheritParams read_redcap
 #'
 #' @keywords internal
 
-multi_choice_to_labels <- function(db_data, db_metadata) {
+multi_choice_to_labels <- function(db_data, db_metadata, raw_or_label = "label") {
+
+  if (raw_or_label == "label") {
+    label_handler <- apply_labs_factor
+  } else if (raw_or_label == "haven") {
+    label_handler <- apply_labs_haven
+  }
   # form_status_complete Column Handling ----
   # Must be done before the creation of form_status_complete
   # select columns that don't appear in field_name_updated and end with
@@ -411,28 +418,9 @@ multi_choice_to_labels <- function(db_data, db_metadata) {
 
   db_data <- db_data %>%
     mutate(
-      # Change double output of raw data to character
-      across(
-        .cols = all_of(form_status_cols),
-        .fns = ~ as.character(.)
-      ),
-      # Map constant values to raw values
-      across(
-        .cols = all_of(form_status_cols),
-        .fns = ~ case_when(
-          . == "0" ~ "Incomplete",
-          . == "1" ~ "Unverified",
-          . == "2" ~ "Complete"
-        )
-      ),
-      # Convert to factor
-      # Map constant values to raw values
       across(
         .cols = all_of(form_status_cols),
-        .fns = ~ factor(
-          .,
-          levels = c("Incomplete", "Unverified", "Complete")
-        )
+        .fns = ~ label_handler(., c("0" = "Incomplete", "1" = "Unverified", "2" = "Complete"), integer(0))
       )
     )
 
@@ -485,15 +473,120 @@ multi_choice_to_labels <- function(db_data, db_metadata) {
 
       # Replace values from db_data$(field_name) with label values from
       # parse_labels key
-      db_data[[field_name]] <- db_data[[field_name]] %>%
-        as.character() %>%
-        recode(!!!parse_labels_output) %>%
-        factor(levels = unique(parse_labels_output))
+
+      db_data[[field_name]] <- label_handler(
+        x = db_data[[field_name]],
+        labels = parse_labels_output,
+        ptype = db_data[[field_name]]
+      )
     }
   }
   db_data
 }
 
+#' @title
+#' Apply factor labels to a vector
+#'
+#' @param x a vector to label
+#' @param labels a named vector of labels in the format `c(value = label)`
+#' @param \dots unused
+#'
+#' @return
+#' factor
+#'
+#' @keywords internal
+apply_labs_factor <- function(x, labels, ...) {
+  as.character(x) %>%
+    recode(!!!labels) %>%
+    factor(levels = unique(labels))
+}
+
+#' @title
+#' Apply haven value labels to a vector
+#'
+#' @details
+#' Assumes a check_installed() has been run for `labelled`. Since `haven` preserves the
+#' underlying data values we need to make sure the data type of the value options in the metadata matches
+#' the data type of the values in the actual data. This function accepts a prototype, usually a column
+#' from db_data, and uses `force_cast()` to do a best-effort casting of the value options in the metadata
+#' to the same data type as `ptype`. The fallback is to convert `x` and the value labels to character.
+#'
+#' @param x a vector to label
+#' @param labels a named vector of labels in the format `c(value = label)`
+#' @param ptype vector to serve as prototype for label values
+#' @param \dots unused
+#'
+#' @return
+#' `haven_labelled` vector
+#'
+#' @keywords internal
+#'
+apply_labs_haven <- function(x, labels, ptype, ...) {
+  # set_value_labels expects labels in c(label = value) format so reverse them
+  labels <- invert_vec(labels)
+  # Try to cast values to match data type in data, catching any parsing warnings
+  cnd <- NULL
+  labels_cast <- withCallingHandlers({
+    force_cast(labels, ptype)
+  },
+  warning = function(w) {
+    cnd <- w
+    cnd_muffle(w)
+  })
+  if (!is.null(attr(labels_cast, "problems"))) {
+    # If there was parsing problem fall back to character
+    x <- as.character(x)
+    labels_cast <- force_cast(labels, character())
+  } else if (!is.null(cnd)) {
+    # If there was some other warning we didn't mean to catch it, so re-raise
+    cli_warn(cnd)
+  }
+
+  labelled::set_value_labels(x, .labels = labels_cast)
+}
+
+#' @title
+#' Swap vector names for values
+#'
+#' @param x a vector
+#'
+#' @return
+#' Vector with names and values reversed
+#'
+#' @keywords internal
+#'
+invert_vec <- function(x) {
+  out <- names(x)
+  # If there were no names do nothing
+  if (is.null(out)) {
+    return(x)
+  }
+  names(out) <- x
+  out
+}
+
+force_cast <- function(x, ptype) {
+  ptype <- vec_ptype(ptype)
+  if (is.logical(ptype)) {
+    out <- parse_logical(x)
+  } else if (is.integer(ptype)) {
+    out <- parse_integer(x)
+  } else if (is.numeric(ptype)) {
+    out <- parse_double(x)
+  } else if (is.Date(ptype)) {
+    out <- parse_date(x)
+  } else if (is.difftime(ptype)) {
+    out <- parse_time(x)
+  } else if (is.POSIXt(ptype)) {
+    out <- parse_datetime(x)
+  } else {
+    out <- parse_character(x)
+  }
+
+  names(out) <- names(x)
+  out
+}
+
 #' @title
 #' Utility function to extract the name of the project identifier field for
 #' a tibble of REDCap data
diff --git a/man/apply_labs_factor.Rd b/man/apply_labs_factor.Rd
new file mode 100644
index 00000000..b5e5f015
--- /dev/null
+++ b/man/apply_labs_factor.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{apply_labs_factor}
+\alias{apply_labs_factor}
+\title{Apply factor labels to a vector}
+\usage{
+apply_labs_factor(x, labels, ...)
+}
+\arguments{
+\item{x}{a vector to label}
+
+\item{labels}{a named vector of labels in the format \code{c(value = label)}}
+
+\item{\dots}{unused}
+}
+\value{
+factor
+}
+\description{
+Apply factor labels to a vector
+}
+\keyword{internal}
diff --git a/man/apply_labs_haven.Rd b/man/apply_labs_haven.Rd
new file mode 100644
index 00000000..ad35755d
--- /dev/null
+++ b/man/apply_labs_haven.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{apply_labs_haven}
+\alias{apply_labs_haven}
+\title{Apply haven value labels to a vector}
+\usage{
+apply_labs_haven(x, labels, ptype, ...)
+}
+\arguments{
+\item{x}{a vector to label}
+
+\item{labels}{a named vector of labels in the format \code{c(value = label)}}
+
+\item{ptype}{vector to serve as prototype for label values}
+
+\item{\dots}{unused}
+}
+\value{
+\code{haven_labelled} vector
+}
+\description{
+Apply haven value labels to a vector
+}
+\details{
+Assumes a check_installed() has been run for \code{labelled}. Since \code{haven} preserves the
+underlying data values we need to make sure the data type of the value options in the metadata matches
+the data type of the values in the actual data. This function accepts a prototype, usually a column
+from db_data, and uses \code{force_cast()} to do a best-effort casting of the value options in the metadata
+to the same data type as \code{ptype}. The fallback is to convert \code{x} and the value labels to character.
+}
+\keyword{internal}
diff --git a/man/invert_vec.Rd b/man/invert_vec.Rd
new file mode 100644
index 00000000..b85eb7ac
--- /dev/null
+++ b/man/invert_vec.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{invert_vec}
+\alias{invert_vec}
+\title{Swap vector names for values}
+\usage{
+invert_vec(x)
+}
+\arguments{
+\item{x}{a vector}
+}
+\value{
+Vector with names and values reversed
+}
+\description{
+Swap vector names for values
+}
+\keyword{internal}
diff --git a/man/multi_choice_to_labels.Rd b/man/multi_choice_to_labels.Rd
index 1a0f291e..2dca6b09 100644
--- a/man/multi_choice_to_labels.Rd
+++ b/man/multi_choice_to_labels.Rd
@@ -4,12 +4,17 @@
 \alias{multi_choice_to_labels}
 \title{Update multiple choice fields with label data}
 \usage{
-multi_choice_to_labels(db_data, db_metadata)
+multi_choice_to_labels(db_data, db_metadata, raw_or_label = "label")
 }
 \arguments{
 \item{db_data}{A REDCap database object}
 
 \item{db_metadata}{A REDCap metadata object}
+
+\item{raw_or_label}{A string (either 'raw', 'label', or 'haven') that specifies whether
+to export the raw coded values or the labels for the options of categorical
+fields. Default is 'label'. If 'haven' is supplied, categorical fields are converted
+to \code{haven_labelled} vectors.}
 }
 \description{
 Update REDCap variables with multi-choice types to standard form labels taken
diff --git a/man/read_redcap.Rd b/man/read_redcap.Rd
index 7cd5d2bb..469899f5 100644
--- a/man/read_redcap.Rd
+++ b/man/read_redcap.Rd
@@ -24,9 +24,10 @@ URI/URL of the REDCap server (e.g.,
 \item{token}{The user-specific string that serves as the password for a
 project. Required.}
 
-\item{raw_or_label}{A string (either 'raw' or 'label') that specifies whether
+\item{raw_or_label}{A string (either 'raw', 'label', or 'haven') that specifies whether
 to export the raw coded values or the labels for the options of categorical
-fields. Default is 'label'.}
+fields. Default is 'label'. If 'haven' is supplied, categorical fields are converted
+to \code{haven_labelled} vectors.}
 
 \item{forms}{A character vector of REDCap instrument names that specifies
 which instruments to import. Default is \code{NULL} which imports all instruments
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
index 164dcf1e..beda46ad 100644
--- a/tests/testthat/test-utils.R
+++ b/tests/testthat/test-utils.R
@@ -69,6 +69,29 @@ test_that("multi_choice_to_labels works", {
 
   expect_factor(out$data_field_types_complete)
   expect_equal(levels(out$data_field_types_complete), c("Incomplete", "Unverified", "Complete"))
+
+  # Haven option works
+  skip_if_not_installed("labelled")
+
+  out <- multi_choice_to_labels(
+    db_data = db_data_classic,
+    db_metadata = db_metadata_classic,
+    raw_or_label = "haven"
+  ) %>%
+    suppressWarnings(classes = c(
+      "empty_parse_warning",
+      "field_missing_categories",
+      "duplicate_labels"
+    ))
+
+  expect_s3_class(out$dropdown_single, "haven_labelled")
+  expect_equal(labelled::val_labels(out$dropdown_single), c("one" = "choice_1", "two" = "choice_2", "three" = "choice_3"))
+  expect_s3_class(out$radio_single, "haven_labelled")
+  expect_equal(labelled::val_labels(out$radio_single), c("A" = "choice_1", "B" = "choice_2", "C" = "choice_3"))
+  expect_s3_class(out$data_field_types_complete, "haven_labelled")
+  expect_equal(labelled::val_labels(out$data_field_types_complete), c("Incomplete" = 0, "Unverified" = 1, "Complete" = 2))
+  expect_s3_class(out$repeatsurvey_radio_v2, "haven_labelled")
+  expect_equal(labelled::val_labels(out$repeatsurvey_radio_v2), c("Choice 1" = 1, "Choice 2" = 2, "Choice 3" = 3))
 })
 
 test_that("parse_labels works", {
@@ -399,3 +422,36 @@ test_that("make_skimr_labels works", {
   expect_true(is.character(skimr_labels))
   expect_true(!is.null(attr(skimr_labels, "name")))
 })
+
+test_that("invert_vec works", {
+  expect_equal(invert_vec(c("a" = 1, "b" = 2)), c("1" = "a", "2" = "b"))
+  expect_equal(invert_vec(1:2), 1:2)
+})
+
+test_that("apply_labs_haven works", {
+  skip_if_not_installed("labelled")
+
+  # Testing all classes where labelled has built in val_labels<- methods
+  out <- apply_labs_haven(1:3, c("1" = "a", "2" = "b", "3" = "c"), integer())
+  expect_s3_class(out, "haven_labelled")
+  expect_equal(labelled::val_labels(out), c(a = 1, b = 2, c = 3))
+
+  out <- apply_labs_haven(c(1.1, 2, 3.3), c("1.1" = "a", "2" = "b", "3.3" = "c"), numeric())
+  expect_s3_class(out, "haven_labelled")
+  expect_equal(labelled::val_labels(out), c(a = 1.1, b = 2.0, c = 3.3))
+
+  out <- apply_labs_haven(letters[1:3], c("a" = "x", "b" = "y", "c" = "z"), character())
+  expect_s3_class(out, "haven_labelled")
+  expect_equal(labelled::val_labels(out), c(x = "a", y = "b", z = "c"))
+
+  # Case with mismatching data types between labels from metadata and data values
+  out <- apply_labs_haven(1:2, c("1" = "a", "2" = "b", "3.3" = "c"), integer())
+  expect_s3_class(out, "haven_labelled")
+  expect_equal(labelled::val_labels(out), c(a = "1", b = "2", c = "3.3"))
+})
+
+test_that("apply_labs_factor works", {
+  out <- apply_labs_factor(1:3, c("1" = "a", "2" = "b", "3" = "c"))
+  expect_s3_class(out, "factor")
+  expect_equal(out, factor(letters[1:3]))
+})