Merge pull request #24 from DesiQuintans/new_verbs

Add new verbs to sift()
DesiQuintans · Jul 15, 2024 · d92b4ee · d92b4ee
2 parents d03152d + 943610f
commit d92b4ee
Show file tree

Hide file tree

Showing 17 changed files with 203 additions and 195 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: siftr
 Type: Package
 Title: Fuzzily Search a Dataframe to Find Relevant Columns
-Version: 1.1.1
+Version: 2.0.0
 Authors@R: c(
     person(given   = "Desi", 
            family  = "Quintans", 
@@ -10,7 +10,7 @@ Authors@R: c(
            comment = c(ORCID = "0000-0003-3356-0293")))
 Description: Analysts who change projects frequently know that it can be hard
     to find the right column in an unfamiliar dataframe, especially when the
-    dataframe spans hundreds of columns and millions of rows. 'siftr' is an
+    dataframe spans thousands of columns and millions of rows. 'siftr' is an
     interactive tool that finds relevant columns by fuzzily searching through
     a dataframe's column names, labels, factor levels, and unique values.
 License: MIT + file LICENSE
@@ -23,4 +23,6 @@ Depends:
 Imports:
     cli,
     fastdigest
-RoxygenNote: 7.2.3
+Suggests:
+    rlang
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,3 +6,6 @@ S3method(as.integer,haven_labelled)
 export(options_sift)
 export(save_dictionary)
 export(sift)
+export(sift.desc)
+export(sift.factors)
+export(sift.name)
diff --git a/R/exp_data_mtcars_lab.R b/R/exp_data_mtcars_lab.R
@@ -59,7 +59,7 @@
 
 # ---- This documents mtcars_lab -----------------------------------------------
 
-#' Labelled version of mtcars for testing `siftr`
+#' Labelled version of `mtcars` for testing `siftr`
 #'
 #' This is `mtcars` with value labels, variable labels (in `vs` only), some transformation
 #' to factor (`car` and `am`), an added Logical column (`above_avg`), an added column

diff --git a/R/exp_options_sift.R b/R/exp_options_sift.R
@@ -69,9 +69,10 @@ options_sift <- function(key = c("sift_limit", "sift_guessmax", "sift_peeklength
     }
 
     if ((key %in% names(default_setting)) == FALSE) {
-        cli::cli_abort(c(
-            "x" = msg_sift("not option", 1, key),
-            "i" = msg_sift("not option", 2, fold_or(names(default_setting)))
+        cli::cli_abort(
+            message = c(
+                "x" = "{.val {key}} is not one of {.pkg siftr}'s options.",
+                "i" = "Accepted options are: {.val {names(default_setting)}}."
             ))
     }
 

diff --git a/R/exp_save_dictionary.R b/R/exp_save_dictionary.R
@@ -36,7 +36,7 @@
 #'
 #' @examples
 #' \donttest{
-#' save_dictionary(CO2, path = tempdir())
+#' save_dictionary(mtcars_lab, path = tempdir())
 #' }
 #'
 #' @md

diff --git a/R/exp_sift.R b/R/exp_sift.R
@@ -1,21 +1,33 @@
 # sift() maintains its hash and dictionary lists in a closure, prooty fancy!
-closure.sift <- function() {
+closure.sift <- function(search_where = NULL) {
+    stopifnot(!is.null(search_where))
+
     current_hash <- list()  # Stores named hashes of dataframes.
     current_dict <- list()  # Stores named dataframe dictionaries.
 
 
-
     # ---- sift() begins here -------------------------
     s <- function(.df, ..., .dist = 0, .rebuild = FALSE) {
         # The df is passed to internal functions as a Char string, then evaluated later.
         df_name <- deparse(substitute(.df))
 
+
         # ---- 1. Ensure that df is a dataframe ------------------------------------------
 
+        # Does it exist at all?
+        tryCatch(is.data.frame(.df), error = function(e) {
+            cli::cli_abort(
+                message = c("No object named {.var {df_name}} was found.",
+                            " " = "Does it exist? Is its name correct?"),
+                call    = NULL)
+        })
+
+        # Is it a dataframe?
         if (!is.data.frame(.df)) {
-            cli::cli_alert_danger( msg_sift("not a df", 1, df_name))
-            cli::cli_alert_warning(msg_sift("not a df", 2))
-            cli::cat_line()
+            cli::cli_abort(
+                message = c("{.var {df_name}} is not a dataframe.",
+                            " " = "{.pkg siftr} only searches through dataframes."),
+                call    = NULL)
         }
 
 
@@ -33,44 +45,64 @@ closure.sift <- function() {
         dict <- current_dict[[df_name]]
 
 
-        # ---- 2. Convert ... into a query and perform a search --------------------------
+        # ---- 3. Shortcut exit if no search is requested --------------------------------
 
-        orig_query <- nse_dots(...)
+        orig_query  <- nse_dots(...)
 
         if (identical(orig_query, character(0))) {
             # If dots is empty, then return the dictionary itself.
 
-            cli::cli_alert_info(msg_sift("report dims", 1,
-                                         df_name,
-                                         length(unique(dict[["varname"]])),
-                                         fold_middle(dict[["varname"]], n = 50)),
-                                wrap = TRUE)
-            cli::cat_line()
+            # Report some of the variables in the dataframe.
+            num_cols   <- length(unique(dict[["varname"]]))
+            some_names <- fold_middle(dict[["varname"]], n = 50)
+            cli::cli_inform(
+                message = c("i" = "{.var {df_name}} has {num_cols} column{?s}:",
+                            " " = "{some_names}.")
+            )
 
             return(invisible(dict))
-        } else if (length(orig_query) == 1) {
+        }
+
+        # ---- 4. If a search is needed, do one ------------------------------------------
+
+        search_haystack <-
+            switch(
+                search_where,
+                "all"     = dict$haystack,  # Searches colnames, col label, factor levels, unique values.
+                "name"    = dict$varname,   # Searches colnames only.
+                "desc"    = dict$var_lab,   # Searches col label only.
+                "factors" = dict$lab_lvls   # Searches variable labels and factor labels.
+            )
+
+
+        if (length(orig_query) == 1) {
             # If dots has just one item in it, then treat it as an agrep() search, which
             # is possibly a regular expression.
 
             query <- orig_query
 
             candidates <-
-                agrep(query, dict$haystack, ignore.case = TRUE, value = FALSE,
+                agrep(query, search_haystack, ignore.case = TRUE, value = FALSE,
                       fixed = FALSE, max.distance = .dist)
         } else {
             # But if dots has more than one element, then use it to build a fuzzy search
             # with look-around.
 
             if (.dist > 0) {
-                cli::cli_warn(c("!" = msg_sift("dist_ignore", 1, .dist),
-                                "i" = msg_sift("dist_ignore", 2)))
+
+                cli::cli_inform(
+                    message = c(
+                        "An orderless search was performed, so {.arg .dist = {(.dist)}} was ignored.",
+                        "To remove this warning, either remove {.arg .dist} or provide your query as a single character string."
+                    )
+                )
             }
 
             query <- fuzzy_needle(orig_query)  # E.g. (?=.*gallon)(?=.*mileage)
 
             candidates <-
                 # Fuzzy needle requires PERL regex, which agrep and aregexc don't support.
-                grep(query, dict$haystack, ignore.case = TRUE, perl = TRUE)
+                grep(query, search_haystack, ignore.case = TRUE, perl = TRUE)
         }
 
 
@@ -89,38 +121,49 @@ closure.sift <- function() {
         # If there are no matches (integer(0)), this returns a dataframe with no rows.
         found <- dict[shown_candidates, ]
 
+        cli::cli_h1("Results for {.code {query}} in {.pkg {df_name}}")
+
+        if (length(shown_candidates) >= 1) {
+            # There's at least one match. I use display_row() to print every row of the return dataframe.
+            apply(X = found, MARGIN = 1, FUN = display_row)
+        }
+
+        cli::cli_h2("Summary")
+
         if (length(shown_candidates) < 1) {
             # No matches.
-            cli::cli_alert_danger(msg_sift("no matches", 1, query, .dist))
+
+            cli::cli_alert_danger("No matches found for query {.code {query}}.", wrap = TRUE)
 
             if (grepl("\`", query)) {
                 # Backtick was found. This can happen if a regex was passed as a bare name.
-                cli::cli_alert_info(msg_sift("no matches", 2), wrap = TRUE)
+                cli::cli_alert_info("If you're using a regular expression, pass it as a string.", wrap = TRUE)
             }
 
             if (length(orig_query) == 1) {
-                cli::cli_alert_info(msg_sift("no matches", 3, .dist), wrap = TRUE)
+                cli::cli_alert_info("Try increasing {.arg .dist = {(.dist)}} to allow more distant matches.", wrap = TRUE)
             }
-
-            cli::cat_line()
         } else {
-            # There's at least one match. I use display_row() to print every row of the return dataframe.
-            apply(X = found, MARGIN = 1, FUN = display_row)
+            # `n results`   = c("There %s %i result%s for query `%s`."),
+            #
+            # `over limit`  = c("Only %1$s of them %2$s printed, set by options_sift(\"sift_limit\", %1$s)"),
 
-            plur <- plural(length(shown_candidates))
-            cli::cat_line()
-            cli::cli_alert_success(msg_sift("n results", 1,
-                                            plur$were,
-                                            total_results,
-                                            plur$s, query))
+            cli::cli_alert_success("Found {total_results} result{?s} for query {.code {query}}.", wrap = TRUE)
 
             if (excess_results > 0) {
-                cli::cli_alert_warning(msg_sift("over limit", 1,
-                                                options_sift("sift_limit"),
-                                                plural(excess_results)$was))
+                cli::cli_alert_warning(
+                    "Only {length(shown_candidates)} of them {?was/were} printed, set by {.run options_sift(\"sift_limit\", {options_sift('sift_limit')})}.",
+                    wrap = TRUE
+                )
             }
+
+            cli::cli_alert_info(
+                "Use {.run View(.Last.value)} to view the full table of matches."
+            )
         }
 
+        cli::cat_line()
+
         # Return a dataframe of all results, not just the ones that were shown.
         return(invisible(dict[candidates, ]))
     }
@@ -176,6 +219,8 @@ closure.sift <- function() {
 #' - If the query was matched, only returns matching rows of the data dictionary.
 #' - If the query was not matched, return no rows of the dictionary (but all columns).
 #'
+#' @describeIn sift Search variable names, descriptive labels, factor labels, and unique values.
+#'
 #' @seealso [siftr::save_dictionary()], [siftr::options_sift()]
 #'
 #' @export
@@ -196,4 +241,31 @@ closure.sift <- function() {
 #' }
 #'
 #' @md
-sift <- closure.sift()
+sift <- closure.sift(search_where = "all")
+
+
+#' @describeIn sift Only search variable names (i.e. column names).
+#' @examples
+#' \donttest{
+#' sift.name(mtcars_lab, "car")  # Only searches variable names.
+#' }
+#' @export
+sift.name <- closure.sift(search_where = "name")
+
+
+#' @describeIn sift Only search the descriptive labels of variables.
+#' @examples
+#' \donttest{
+#' sift.desc(mtcars_lab, "car")  # Only searches variable descriptions.
+#' }
+#' @export
+sift.desc <- closure.sift(search_where = "desc")
+
+
+#' @describeIn sift Only search factor labels. This includes "value labels", e.g. 'haven_labelled' types.
+#' @examples
+#' \donttest{
+#' sift.factors(mtcars_lab, "manual")  # Only searches factor levels and value labels.
+#' }
+#' @export
+sift.factors <- closure.sift(search_where = "factors")
diff --git a/R/prv_build_dictionary.R b/R/prv_build_dictionary.R
@@ -9,7 +9,9 @@ build_dictionary <- function(DF, dictlist) {
     df_name <- DF
     DF <- eval(as.symbol(DF))
 
-    cli::cli_alert_info(msg_sift("building", 1, df_name), wrap = TRUE)
+    cli::cli_inform(
+        message = c("i" = "Building the dictionary for {.var {df_name}}...")
+    )
 
     start_time <- Sys.time()
 
@@ -19,6 +21,16 @@ build_dictionary <- function(DF, dictlist) {
     raw_var_labs <- sapply(DF, attr, "label")
     raw_val_labs <- sapply(DF, function(col) { names(attr(col, "labels")) })  # The names are what I want.
     raw_fct_lvls <- sapply(DF, levels)
+    # `raw_lab_lvls` combines the value labels and factor levels into one thing. With SAS datasets, for example,
+    # `haven` imports factors as labelled variables and not as proper factors.
+    raw_lab_lvls <- mapply(
+        function(x, y) {
+            result <- unique(c(x, y))
+            result[is.null(result)] <- ""
+            return(result)
+        },
+        raw_fct_lvls, raw_val_labs, SIMPLIFY = FALSE)
+
 
     # Extra details for the data dictionary
     dct_type_strs <- sapply(DF, coltype)
@@ -30,11 +42,12 @@ build_dictionary <- function(DF, dictlist) {
     dct_all_same  <- sapply(DF, invariant)
 
     # Getting labels into vectors of length 1.
-    var_labs <- esc_braces(crunch(raw_var_labs))
-    val_labs <- esc_braces(crunch(raw_val_labs))
-    fct_lvls <- esc_braces(crunch(raw_fct_lvls))
+    var_labs  <- esc_braces(crunch(raw_var_labs))
+    val_labs  <- esc_braces(crunch(raw_val_labs))
+    fct_lvls  <- esc_braces(crunch(raw_fct_lvls))
+    labs_lvls <- esc_braces(crunch(raw_lab_lvls))
     # Those labels and unique values joined together to make searchable strings.
-    haystacks <- smash(raw_varnames, var_labs, val_labs, fct_lvls, dct_rand_uniq)
+    haystacks <- smash(raw_varnames, var_labs, labs_lvls, dct_rand_uniq)
 
     dictionary <-
         data.frame(
@@ -48,7 +61,9 @@ build_dictionary <- function(DF, dictlist) {
             all_same    = dct_all_same,
             val_lab     = codify(raw_val_labs),
             fct_lvl     = codify(raw_fct_lvls),
+            labs_lvls   = labs_lvls,
             fct_ordered = dct_ordered,
+
             class       = codify(dct_classes),
             type        = codify(dct_types),
             haystack    = haystacks,
@@ -61,7 +76,9 @@ build_dictionary <- function(DF, dictlist) {
     elapsed <- round(end_time - start_time, digits = 2)
     elapsed_str <- paste(elapsed, attr(elapsed, "units"))
 
-    cli::cli_alert_success(msg_sift("built", 1, elapsed_str))
+    cli::cli_inform(
+        message = c("v" = "Dictionary was built in {elapsed_str}."),
+    )
     cli::cat_line()
 
     return(invisible(dictlist))

diff --git a/R/prv_msg_sift.R b/R/prv_msg_sift.R