From e4c101a9645e436b9406be0b89cc7a11ae89c70b Mon Sep 17 00:00:00 2001 From: Desi Joshua Quintans Date: Sun, 14 Jul 2024 15:35:56 +1000 Subject: [PATCH 1/3] Improve formatting of colnames printout --- R/exp_sift.R | 76 +++++++++++++++++++++++++++++++++------- R/prv_build_dictionary.R | 8 +++-- R/prv_msg_sift.R | 7 ---- R/prv_private_funs.R | 4 +-- 4 files changed, 71 insertions(+), 24 deletions(-) diff --git a/R/exp_sift.R b/R/exp_sift.R index 62f1613..c42f875 100644 --- a/R/exp_sift.R +++ b/R/exp_sift.R @@ -1,21 +1,31 @@ # sift() maintains its hash and dictionary lists in a closure, prooty fancy! -closure.sift <- function() { +closure.sift <- function(search_where) { current_hash <- list() # Stores named hashes of dataframes. current_dict <- list() # Stores named dataframe dictionaries. - # ---- sift() begins here ------------------------- s <- function(.df, ..., .dist = 0, .rebuild = FALSE) { # The df is passed to internal functions as a Char string, then evaluated later. df_name <- deparse(substitute(.df)) + # ---- 1. Ensure that df is a dataframe ------------------------------------------ + # Does it exist at all? + tryCatch(is.data.frame(.df), error = function(e) { + cli::cli_abort( + message = c("No object named {.var {df_name}} was found.", + " " = "Does it exist? Is its name correct?"), + call = NULL) + }) + + # Is it a dataframe? if (!is.data.frame(.df)) { - cli::cli_alert_danger( msg_sift("not a df", 1, df_name)) - cli::cli_alert_warning(msg_sift("not a df", 2)) - cli::cat_line() + cli::cli_abort( + message = c("{.var {df_name}} is not a dataframe.", + " " = "{.pkg siftr} only searches through dataframes."), + call = NULL) } @@ -33,24 +43,64 @@ closure.sift <- function() { dict <- current_dict[[df_name]] - # ---- 2. Convert ... into a query and perform a search -------------------------- + # ---- 3. Shortcut exit if no search is requested -------------------------------- - orig_query <- nse_dots(...) + orig_query <- nse_dots(...) 
if (identical(orig_query, character(0))) { # If dots is empty, then return the dictionary itself. - cli::cli_alert_info(msg_sift("report dims", 1, - df_name, - length(unique(dict[["varname"]])), - fold_middle(dict[["varname"]], n = 50)), - wrap = TRUE) - cli::cat_line() + # Report some of the variables in the dataframe. + num_cols <- length(unique(dict[["varname"]])) + some_names <- fold_middle(dict[["varname"]], n = 50) + cli::cli_inform( + message = c("i" = "{.var {df_name}} has {num_cols} column{?s}:", + " " = "{some_names}.") + ) return(invisible(dict)) + } + + # ---- 4. If a search is needed, do one ------------------------------------------ + + orig_query <- nse_dots(...) + + # TAIL: search_where: near, name, desc, factors, haystack + + search_func <- character(0) + + if (identical(orig_query, character(0))) { + # If dots is empty, then no search is performed. + query <- orig_query + } else if (length(orig_query) == 1) { # If dots has just one item in it, then treat it as an agrep() search, which # is possibly a regular expression. + search_func <- "agrep" + query <- orig_query + + } else { + # If dots has more than one element, then use it to build a fuzzy search + # with look-around. + + if (.dist > 0) { + cli::cli_warn(c("!" = msg_sift("dist_ignore", 1, .dist), + "i" = msg_sift("dist_ignore", 2))) + } + + # Fuzzy needle requires PERL regex, which agrep and aregexc don't support. + search_func <- "grep" + query <- fuzzy_needle(orig_query) # E.g. (?=.*gallon)(?=.*mileage) + } + + + + + # ---- 2. Convert ... into a query ----------------------------------------------- + + if (length(orig_query) == 1) { + # If dots has just one item in it, then treat it as an agrep() search, which + # is possibly a regular expression. 
query <- orig_query diff --git a/R/prv_build_dictionary.R b/R/prv_build_dictionary.R index 5556595..8c37945 100644 --- a/R/prv_build_dictionary.R +++ b/R/prv_build_dictionary.R @@ -9,7 +9,9 @@ build_dictionary <- function(DF, dictlist) { df_name <- DF DF <- eval(as.symbol(DF)) - cli::cli_alert_info(msg_sift("building", 1, df_name), wrap = TRUE) + cli::cli_inform( + message = c("i" = "Building the dictionary for {.var {df_name}}...") + ) start_time <- Sys.time() @@ -61,7 +63,9 @@ build_dictionary <- function(DF, dictlist) { elapsed <- round(end_time - start_time, digits = 2) elapsed_str <- paste(elapsed, attr(elapsed, "units")) - cli::cli_alert_success(msg_sift("built", 1, elapsed_str)) + cli::cli_inform( + message = c("v" = "Dictionary was built in {elapsed_str}."), + ) cli::cat_line() return(invisible(dictlist)) diff --git a/R/prv_msg_sift.R b/R/prv_msg_sift.R index acdbbdd..d87059f 100644 --- a/R/prv_msg_sift.R +++ b/R/prv_msg_sift.R @@ -12,19 +12,12 @@ # @md msg_sift <- function(entry, i = 1, ...) { text <- list( - `not a df` = c("'%s' is not a dataframe.", - "sift() only searches through dataframes."), - `report dims` = c("'%s' has %i columns: %s."), `no matches` = c("No matches found for query '%s' with .dist = %.2f.", "If you're using a regular expression, pass it as a string.", "Try increasing '.dist = %.2f' to allow more distant matches."), - `building` = c("Building dictionary for '%s'. This only happens when it changes."), - - `built` = c("Dictionary was built in %s."), - `n results` = c("There %s %i result%s for query `%s`."), `over limit` = c("Only %1$s of them %2$s printed, set by options_sift(\"sift_limit\", %1$s)"), diff --git a/R/prv_private_funs.R b/R/prv_private_funs.R index e74ee36..543617d 100644 --- a/R/prv_private_funs.R +++ b/R/prv_private_funs.R @@ -58,9 +58,9 @@ fold_middle <- function(vec, n = 2) { head_idx <- 1:num_head tail_idx <- (length(vec) - (num_tail - 1)):length(vec) - sprintf("%s ... [%i skipped] ... 
%s", + sprintf("%s...\f[ Skipping %s ]\f...%s", paste(vec[head_idx], collapse = ", "), - length(vec) - n, + cli::pluralize("{length(vec) - n} column{?s}"), paste(vec[tail_idx], collapse = ", ")) } From d1e76c0b79ae6359a7e21e92f14bd4ac3b210c0f Mon Sep 17 00:00:00 2001 From: Desi Joshua Quintans Date: Mon, 15 Jul 2024 17:17:07 +1000 Subject: [PATCH 2/3] Add subsearch options to sift() --- DESCRIPTION | 8 ++- NAMESPACE | 3 + R/exp_data_mtcars_lab.R | 2 +- R/exp_options_sift.R | 7 ++- R/exp_save_dictionary.R | 2 +- R/exp_sift.R | 132 +++++++++++++++++++++++---------------- R/prv_build_dictionary.R | 21 +++++-- R/prv_msg_sift.R | 32 ---------- R/prv_private_funs.R | 77 +---------------------- R/prv_should_approx.R | 11 ++-- R/prv_show_display_row.R | 4 +- R/prv_summary_funcs.R | 18 +++--- man/mtcars_lab.Rd | 2 +- man/save_dictionary.Rd | 2 +- man/sift.Rd | 29 +++++++++ man/siftr-package.Rd | 2 +- 16 files changed, 156 insertions(+), 196 deletions(-) delete mode 100644 R/prv_msg_sift.R diff --git a/DESCRIPTION b/DESCRIPTION index 7db68a1..b73458d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: siftr Type: Package Title: Fuzzily Search a Dataframe to Find Relevant Columns -Version: 1.1.1 +Version: 2.0.0 Authors@R: c( person(given = "Desi", family = "Quintans", @@ -10,7 +10,7 @@ Authors@R: c( comment = c(ORCID = "0000-0003-3356-0293"))) Description: Analysts who change projects frequently know that it can be hard to find the right column in an unfamiliar dataframe, especially when the - dataframe spans hundreds of columns and millions of rows. 'siftr' is an + dataframe spans thousands of columns and millions of rows. 'siftr' is an interactive tool that finds relevant columns by fuzzily searching through a dataframe's column names, labels, factor levels, and unique values. 
License: MIT + file LICENSE @@ -23,4 +23,6 @@ Depends: Imports: cli, fastdigest -RoxygenNote: 7.2.3 +Suggests: + rlang +RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index 6e3db00..5e6ee0e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,3 +6,6 @@ S3method(as.integer,haven_labelled) export(options_sift) export(save_dictionary) export(sift) +export(sift.desc) +export(sift.factors) +export(sift.name) diff --git a/R/exp_data_mtcars_lab.R b/R/exp_data_mtcars_lab.R index 2060068..28e53a5 100644 --- a/R/exp_data_mtcars_lab.R +++ b/R/exp_data_mtcars_lab.R @@ -59,7 +59,7 @@ # ---- This documents mtcars_lab ----------------------------------------------- -#' Labelled version of mtcars for testing `siftr` +#' Labelled version of `mtcars` for testing `siftr` #' #' This is `mtcars` with value labels, variable labels (in `vs` only), some transformation #' to factor (`car` and `am`), an added Logical column (`above_avg`), an added column diff --git a/R/exp_options_sift.R b/R/exp_options_sift.R index 1ec1f31..03e8ddb 100644 --- a/R/exp_options_sift.R +++ b/R/exp_options_sift.R @@ -69,9 +69,10 @@ options_sift <- function(key = c("sift_limit", "sift_guessmax", "sift_peeklength } if ((key %in% names(default_setting)) == FALSE) { - cli::cli_abort(c( - "x" = msg_sift("not option", 1, key), - "i" = msg_sift("not option", 2, fold_or(names(default_setting))) + cli::cli_abort( + message = c( + "x" = "{.val {key}} is not one of {.pkg siftr}'s options.", + "i" = "Accepted options are: {.val {names(default_setting)}}." 
)) } diff --git a/R/exp_save_dictionary.R b/R/exp_save_dictionary.R index 548e867..4fb670d 100644 --- a/R/exp_save_dictionary.R +++ b/R/exp_save_dictionary.R @@ -36,7 +36,7 @@ #' #' @examples #' \donttest{ -#' save_dictionary(CO2, path = tempdir()) +#' save_dictionary(mtcars_lab, path = tempdir()) #' } #' #' @md diff --git a/R/exp_sift.R b/R/exp_sift.R index c42f875..341a54e 100644 --- a/R/exp_sift.R +++ b/R/exp_sift.R @@ -1,5 +1,7 @@ # sift() maintains its hash and dictionary lists in a closure, prooty fancy! -closure.sift <- function(search_where) { +closure.sift <- function(search_where = NULL) { + stopifnot(!is.null(search_where)) + current_hash <- list() # Stores named hashes of dataframes. current_dict <- list() # Stores named dataframe dictionaries. @@ -63,40 +65,15 @@ closure.sift <- function(search_where) { # ---- 4. If a search is needed, do one ------------------------------------------ - orig_query <- nse_dots(...) - - # TAIL: search_where: near, name, desc, factors, haystack - - search_func <- character(0) - - if (identical(orig_query, character(0))) { - # If dots is empty, then no search is performed. - query <- orig_query - - } else if (length(orig_query) == 1) { - # If dots has just one item in it, then treat it as an agrep() search, which - # is possibly a regular expression. - search_func <- "agrep" - query <- orig_query - - } else { - # If dots has more than one element, then use it to build a fuzzy search - # with look-around. - - if (.dist > 0) { - cli::cli_warn(c("!" = msg_sift("dist_ignore", 1, .dist), - "i" = msg_sift("dist_ignore", 2))) - } - - # Fuzzy needle requires PERL regex, which agrep and aregexc don't support. - search_func <- "grep" - query <- fuzzy_needle(orig_query) # E.g. (?=.*gallon)(?=.*mileage) - } - - - + search_haystack <- + switch( + search_where, + "all" = dict$haystack, # Searches colnames, col label, factor levels, unique values. + "name" = dict$varname, # Searches colnames only. 
+ "desc" = dict$var_lab, # Searches col label only. + "factors" = dict$labs_lvls # Searches factor levels and value labels (dictionary column `labs_lvls`). + ) - # ---- 2. Convert ... into a query ----------------------------------------------- if (length(orig_query) == 1) { # If dots has just one item in it, then treat it as an agrep() search, which @@ -105,22 +82,27 @@ closure.sift <- function(search_where) { query <- orig_query candidates <- - agrep(query, dict$haystack, ignore.case = TRUE, value = FALSE, + agrep(query, search_haystack, ignore.case = TRUE, value = FALSE, fixed = FALSE, max.distance = .dist) } else { # But if dots has more than one element, then use it to build a fuzzy search # with look-around. if (.dist > 0) { - cli::cli_warn(c("!" = msg_sift("dist_ignore", 1, .dist), - "i" = msg_sift("dist_ignore", 2))) + + cli::cli_inform( + message = c( + "An orderless search was performed, so {.arg .dist = {(.dist)}} was ignored.", + "To remove this warning, either remove {.arg .dist} or provide your query as a single character string." + ) + ) } query <- fuzzy_needle(orig_query) # E.g. (?=.*gallon)(?=.*mileage) candidates <- # Fuzzy needle requires PERL regex, which agrep and aregexc don't support. - grep(query, dict$haystack, ignore.case = TRUE, perl = TRUE) + grep(query, search_haystack, ignore.case = TRUE, perl = TRUE) } @@ -139,38 +121,49 @@ closure.sift <- function(search_where) { # If there are no matches (integer(0)), this returns a dataframe with no rows. found <- dict[shown_candidates, ] + cli::cli_h1("Results for {.code {query}} in {.pkg {df_name}}") + + if (length(shown_candidates) >= 1) { + # There's at least one match. I use display_row() to print every row of the return dataframe. + apply(X = found, MARGIN = 1, FUN = display_row) + } + + cli::cli_h2("Summary") + if (length(shown_candidates) < 1) { # No matches. 
- cli::cli_alert_danger(msg_sift("no matches", 1, query, .dist)) + + cli::cli_alert_danger("No matches found for query {.code {query}}.", wrap = TRUE) if (grepl("\`", query)) { # Backtick was found. This can happen if a regex was passed as a bare name. - cli::cli_alert_info(msg_sift("no matches", 2), wrap = TRUE) + cli::cli_alert_info("If you're using a regular expression, pass it as a string.", wrap = TRUE) } if (length(orig_query) == 1) { - cli::cli_alert_info(msg_sift("no matches", 3, .dist), wrap = TRUE) + cli::cli_alert_info("Try increasing {.arg .dist = {(.dist)}} to allow more distant matches.", wrap = TRUE) } - - cli::cat_line() } else { - # There's at least one match. I use display_row() to print every row of the return dataframe. - apply(X = found, MARGIN = 1, FUN = display_row) + # `n results` = c("There %s %i result%s for query `%s`."), + # + # `over limit` = c("Only %1$s of them %2$s printed, set by options_sift(\"sift_limit\", %1$s)"), - plur <- plural(length(shown_candidates)) - cli::cat_line() - cli::cli_alert_success(msg_sift("n results", 1, - plur$were, - total_results, - plur$s, query)) + cli::cli_alert_success("Found {total_results} result{?s} for query {.code {query}}.", wrap = TRUE) if (excess_results > 0) { - cli::cli_alert_warning(msg_sift("over limit", 1, - options_sift("sift_limit"), - plural(excess_results)$was)) + cli::cli_alert_warning( + "Only {length(shown_candidates)} of them {?was/were} printed, set by {.run options_sift(\"sift_limit\", {options_sift('sift_limit')})}.", + wrap = TRUE + ) } + + cli::cli_alert_info( + "Use {.run View(.Last.value)} to view the full table of matches." + ) } + cli::cat_line() + # Return a dataframe of all results, not just the ones that were shown. return(invisible(dict[candidates, ])) } @@ -226,6 +219,8 @@ closure.sift <- function(search_where) { #' - If the query was matched, only returns matching rows of the data dictionary. 
#' - If the query was not matched, return no rows of the dictionary (but all columns). #' +#' @describeIn sift Search variable names, descriptive labels, factor labels, and unique values. +#' #' @seealso [siftr::save_dictionary()], [siftr::options_sift()] #' #' @export @@ -246,4 +241,31 @@ closure.sift <- function(search_where) { #' } #' #' @md -sift <- closure.sift() +sift <- closure.sift(search_where = "all") + + +#' @describeIn sift Only search variable names (i.e. column names). +#' @examples +#' \donttest{ +#' sift.name(mtcars_lab, "car") # Only searches variable names. +#' } +#' @export +sift.name <- closure.sift(search_where = "name") + + +#' @describeIn sift Only search the descriptive labels of variables. +#' @examples +#' \donttest{ +#' sift.desc(mtcars_lab, "car") # Only searches variable descriptions. +#' } +#' @export +sift.desc <- closure.sift(search_where = "desc") + + +#' @describeIn sift Only search factor labels. This includes "value labels", e.g. 'haven_labelled' types. +#' @examples +#' \donttest{ +#' sift.factors(mtcars_lab, "manual") # Only searches factor levels and value labels. +#' } +#' @export +sift.factors <- closure.sift(search_where = "factors") diff --git a/R/prv_build_dictionary.R b/R/prv_build_dictionary.R index 8c37945..956c6f3 100644 --- a/R/prv_build_dictionary.R +++ b/R/prv_build_dictionary.R @@ -21,6 +21,16 @@ build_dictionary <- function(DF, dictlist) { raw_var_labs <- sapply(DF, attr, "label") raw_val_labs <- sapply(DF, function(col) { names(attr(col, "labels")) }) # The names are what I want. raw_fct_lvls <- sapply(DF, levels) + # `raw_lab_lvls` combines the value labels and factor levels into one thing. With SAS datasets, for example, + # `haven` imports factors as labelled variables and not as proper factors. 
+ raw_lab_lvls <- mapply( + function(x, y) { + result <- unique(c(x, y)) + result[is.null(result)] <- "" + return(result) + }, + raw_fct_lvls, raw_val_labs, SIMPLIFY = FALSE) + # Extra details for the data dictionary dct_type_strs <- sapply(DF, coltype) @@ -32,11 +42,12 @@ build_dictionary <- function(DF, dictlist) { dct_all_same <- sapply(DF, invariant) # Getting labels into vectors of length 1. - var_labs <- esc_braces(crunch(raw_var_labs)) - val_labs <- esc_braces(crunch(raw_val_labs)) - fct_lvls <- esc_braces(crunch(raw_fct_lvls)) + var_labs <- esc_braces(crunch(raw_var_labs)) + val_labs <- esc_braces(crunch(raw_val_labs)) + fct_lvls <- esc_braces(crunch(raw_fct_lvls)) + labs_lvls <- esc_braces(crunch(raw_lab_lvls)) # Those labels and unique values joined together to make searchable strings. - haystacks <- smash(raw_varnames, var_labs, val_labs, fct_lvls, dct_rand_uniq) + haystacks <- smash(raw_varnames, var_labs, labs_lvls, dct_rand_uniq) dictionary <- data.frame( @@ -50,7 +61,9 @@ build_dictionary <- function(DF, dictlist) { all_same = dct_all_same, val_lab = codify(raw_val_labs), fct_lvl = codify(raw_fct_lvls), + labs_lvls = labs_lvls, fct_ordered = dct_ordered, + class = codify(dct_classes), type = codify(dct_types), haystack = haystacks, diff --git a/R/prv_msg_sift.R b/R/prv_msg_sift.R deleted file mode 100644 index d87059f..0000000 --- a/R/prv_msg_sift.R +++ /dev/null @@ -1,32 +0,0 @@ - -# User-facing messages are collected here for easy editing/tone consistency. -# -# @param entry (String) The name of one of the entries below. -# @param i (Integer) The index/line to return from the entry. -# @param ... (Dots) Arguments that will be passed to [sprintf()] to fill the placeholders -# in the messages. -# -# @return A named list of character vectors. -# @examples -# # msg_sift("not a df", 1, "month.abb") -# @md -msg_sift <- function(entry, i = 1, ...) 
{ - text <- list( - `report dims` = c("'%s' has %i columns: %s."), - - `no matches` = c("No matches found for query '%s' with .dist = %.2f.", - "If you're using a regular expression, pass it as a string.", - "Try increasing '.dist = %.2f' to allow more distant matches."), - - `n results` = c("There %s %i result%s for query `%s`."), - - `over limit` = c("Only %1$s of them %2$s printed, set by options_sift(\"sift_limit\", %1$s)"), - - `not option` = c("'%s' is not one of sift's options.", - "Did you mean %s?"), - - `dist_ignore` = c("An orderless search was performed, so '.dist = %.2f' was ignored.", - "To remove this warning, either remove '.dist' or provide your query as a single character string.") - ) - return(sprintf(text[[entry]][i], ...)) -} diff --git a/R/prv_private_funs.R b/R/prv_private_funs.R index 543617d..52a59c6 100644 --- a/R/prv_private_funs.R +++ b/R/prv_private_funs.R @@ -26,23 +26,8 @@ codify <- function(x, rev = FALSE) { } -# For a multi-element vector, print the first n unique items and announce how many -# others remain. -# If n > length(vec), just print the whole thing. -fold <- function(vec, n = 2) { - vec <- unique(vec) - - items <- vec[1:n] - items <- items[!is.na(items)] - - remain <- sum(!match(vec, items, nomatch = FALSE), na.rm = TRUE) - remain_str <- ifelse(remain > 0, sprintf(", and %i more", remain), "") - - paste0(paste(items, collapse = ", "), remain_str) -} - - - +# Given a vector with many elements, prints some from the head, some from the +# tail, and skips the excess in the middle. fold_middle <- function(vec, n = 2) { if (n < 2) { n <- 2 # A 'middle' needs to exist. @@ -65,31 +50,11 @@ fold_middle <- function(vec, n = 2) { } - -# For a multi-element vector 1:4, report it as "1, 2, 3, or 4". 
-fold_or <- function(vec, word = "or") { - v <- as.character(vec) - v[length(v)] <- paste(word, v[length(v)]) - return(paste(v, collapse = ", ")) -} - - - # Turn a list of words into a fuzzy regex # # A fuzzy regex is one that will match search terms in any order by using PERL # lookaround. This can be very slow, but is often worth the cost to get more # complete results. -# -# @param vec (Character) A string containing space-separated keywords to search for. -# -# @return A string where each word has been wrapped as a lookaround term. -# -# @examples -# \dontrun{ -# fuzzy_needle("network centrality") -# #> [1] "(?=.*network)(?=.*centrality)" -# } fuzzy_needle <- function(vec) { words <- unique(unlist(strsplit(vec, "\\s+"))) @@ -104,44 +69,6 @@ has_class <- function(obj, classname) { } -# Truncate long strings with ellipsis -# shorten(state.name, 7) -shorten <- function(x, width) { - x <- as.character(x) - is_long <- nchar(x) > width - # Makes room for ellipsis - x[is_long] <- paste0(substr(x[is_long], 1, width - 1), - cli::symbol["ellipsis"]) - - return(x) -} - - - - - - -# Plural forms -plural <- function(n) { - if (n == 1) { - other <- "other" - s <- "" - was <- "was" - } else { - other <- "others" - s <- "s" - was <- "were" - } - - return(invisible(list(other = other, - others = other, - s = s, - was = was, - were = was))) -} - - - # Escape braces in glue() by doubling them # https://glue.tidyverse.org/#a-literal-brace-is-inserted-by-using-doubled-braces esc_braces <- function(str) { diff --git a/R/prv_should_approx.R b/R/prv_should_approx.R index a3af774..4995a1a 100644 --- a/R/prv_should_approx.R +++ b/R/prv_should_approx.R @@ -7,13 +7,12 @@ # is small enough to simply use, or whether it's so big that randomly sampling from it # would be more efficient. # -# @param x (Vector) The vector to sample. +# x (Vector) The vector to sample. 
# -# @return A named list with 3 entries: `indices` contains chosen indices of `x`; `marker` -# is a text marker that indicates whether `x` was sampled. `is_approx` is `TRUE` if -# `x` was sampled, and `FALSE` if all indices of `x` were returned. -# -# @md +# Returns: A named list with 3 entries: `indices` contains chosen indices of +# `x`; `marker` is a text marker that indicates whether `x` was sampled. +# `is_approx` is `TRUE` if `x` was sampled, and `FALSE` if all indices +# of `x` were returned. should_approx <- function(x) { if (length(x) > options_sift("sift_guessmax")) { x <- sort(sample.int(n = length(x), diff --git a/R/prv_show_display_row.R b/R/prv_show_display_row.R index 6a5486b..cb814fa 100644 --- a/R/prv_show_display_row.R +++ b/R/prv_show_display_row.R @@ -3,10 +3,10 @@ # # This function controls how results from the data dictionary # -# @param dr (Character) A named Character vector which is one extracted row +# dr (Character) A named Character vector which is one extracted row # of the data dictionary (as sent to this function by apply() in sift()). # -# @return Side-effect of printing to the terminal. +# Returns nothing, with the side-effect of printing to the terminal. display_row <- function(dr) { # This should check whether the summary stat is NA/NULL, and if it's not, then print it. # This means that control of what is displayed from the dictionary is held here, and diff --git a/R/prv_summary_funcs.R b/R/prv_summary_funcs.R index 645edc2..22ea2a0 100644 --- a/R/prv_summary_funcs.R +++ b/R/prv_summary_funcs.R @@ -1,13 +1,11 @@ # Fast approximate summary functions # Randomly choose and show some unique values from a variable. -# # This does some clever stuff depending on the class of the variable. # -# @param x (Vector) A vector. +# x (Vector) A vector. # -# @return A Character string. -# @md +# Returns a Character string. 
some_uniques <- function(x) { # ---- Special cases ------------------------------------------------------- @@ -113,11 +111,10 @@ some_uniques <- function(x) { # it still has to run on each column, and there may be hundreds of those. It # therefore randomly samples the vector once it reaches `sift_guessmax`'s limit. # -# @param x (Vector) A vector. -# @param na.rm (Logical) If `TRUE`, remove `NA`s. +# x (Vector) A vector. +# na.rm (Logical) If `TRUE`, remove `NA`s. # -# @return A logical. -# @md +# Returns a logical. invariant <- function(x) { # Source: https://stackoverflow.com/a/59067398/5578429 @@ -148,10 +145,9 @@ invariant <- function(x) { # objects with class `haven_labelled` have an underlying type, and both the # class and type are important to show. # -# @param x (Vector) A vector. +# x (Vector) A vector. # -# @return Character. -# @md +# Returns a Character. coltype <- function(x) { # class typeof # diff --git a/man/mtcars_lab.Rd b/man/mtcars_lab.Rd index 8449630..b1c3d65 100644 --- a/man/mtcars_lab.Rd +++ b/man/mtcars_lab.Rd @@ -3,7 +3,7 @@ \docType{data} \name{mtcars_lab} \alias{mtcars_lab} -\title{Labelled version of mtcars for testing \code{siftr}} +\title{Labelled version of \code{mtcars} for testing \code{siftr}} \format{ A dataframe with 15 columns and 32 rows. } diff --git a/man/save_dictionary.Rd b/man/save_dictionary.Rd index 46827d7..c8381a7 100644 --- a/man/save_dictionary.Rd +++ b/man/save_dictionary.Rd @@ -45,7 +45,7 @@ you need to relabel it every time. 
} \examples{ \donttest{ -save_dictionary(CO2, path = tempdir()) +save_dictionary(mtcars_lab, path = tempdir()) } } diff --git a/man/sift.Rd b/man/sift.Rd index 39be45b..91db26f 100644 --- a/man/sift.Rd +++ b/man/sift.Rd @@ -2,9 +2,18 @@ % Please edit documentation in R/exp_sift.R \name{sift} \alias{sift} +\alias{sift.name} +\alias{sift.desc} +\alias{sift.factors} \title{Find relevant variables in a dataframe using fuzzy searches} \usage{ sift(.df, ..., .dist = 0, .rebuild = FALSE) + +sift.name(.df, ..., .dist = 0, .rebuild = FALSE) + +sift.desc(.df, ..., .dist = 0, .rebuild = FALSE) + +sift.factors(.df, ..., .dist = 0, .rebuild = FALSE) } \arguments{ \item{.df}{(Dataframe) A dataframe to search through.} @@ -60,6 +69,17 @@ The search that's performed depends on \code{...} and \code{.dist}: is ignored in orderless searching. } } +\section{Functions}{ +\itemize{ +\item \code{sift()}: Search variable names, descriptive labels, factor labels, and unique values. + +\item \code{sift.name()}: Only search variable names (i.e. column names). + +\item \code{sift.desc()}: Only search the descriptive labels of variables. + +\item \code{sift.factors()}: Only search factor labels. This includes "value labels", e.g. 'haven_labelled' types. + +}} \examples{ \donttest{ sift(mtcars_lab) # Builds a dictionary without searching. @@ -75,6 +95,15 @@ sift(mtcars_lab, "date|time") # Regular expression sift(mtcars_lab, "cyl|gear", number) # Orderless search with regular expression } +\donttest{ +sift.name(mtcars_lab, "car") # Only searches variable names. +} +\donttest{ +sift.desc(mtcars_lab, "car") # Only searches variable descriptions. +} +\donttest{ +sift.factors(mtcars_lab, "manual") # Only searches factor levels and value labels. 
+} } \seealso{ \code{\link[=save_dictionary]{save_dictionary()}}, \code{\link[=options_sift]{options_sift()}} diff --git a/man/siftr-package.Rd b/man/siftr-package.Rd index b831e8f..b1188a5 100644 --- a/man/siftr-package.Rd +++ b/man/siftr-package.Rd @@ -6,7 +6,7 @@ \alias{siftr-package} \title{siftr: Fuzzily Search a Dataframe to Find Relevant Columns} \description{ -Analysts who change projects frequently know that it can be hard to find the right column in an unfamiliar dataframe, especially when the dataframe spans hundreds of columns and millions of rows. 'siftr' is an interactive tool that finds relevant columns by fuzzily searching through a dataframe's column names, labels, factor levels, and unique values. +Analysts who change projects frequently know that it can be hard to find the right column in an unfamiliar dataframe, especially when the dataframe spans thousands of columns and millions of rows. 'siftr' is an interactive tool that finds relevant columns by fuzzily searching through a dataframe's column names, labels, factor levels, and unique values. } \seealso{ Useful links: From 943610ff0887de3941a206867eeb3d364e408888 Mon Sep 17 00:00:00 2001 From: Desi Joshua Quintans Date: Mon, 15 Jul 2024 17:25:04 +1000 Subject: [PATCH 3/3] Update README.md --- README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 69b941c..c6961f8 100644 --- a/README.md +++ b/README.md @@ -15,11 +15,9 @@ It was designed with medical, census, and survey data in mind, where dataframes # Installation ``` r -# Install it from CRAN -install.packages("siftr") +# CRAN soon # Or install the live development version from Github. -# Want to know what this version has compared to CRAN? See 'NEWS.md' above. 
remotes::install_github("DesiQuintans/siftr") ``` @@ -42,12 +40,15 @@ options(defaultPackages = c('datasets', 'utils', 'grDevices', 'graphics', 'stats # Functions in `siftr` -| Function | Description | -|:-----------------|:-----------------------------------------------------| -| `sift()` | Search through a dataframe's columns. | +| Function | Description | +|:--------------------|:-------------------------------------------------------| +| `sift()` | Search through a dataframe's columns. | +| `sift.name()` | Only search variable names (i.e. column names). | +| `sift.desc()` | Only search descriptive labels. | +| `sift.factors()` | Only search factor labels (and value labels). | | `save_dictionary()` | Save the data dictionary for use with [`tsv2label`][1] | -| `options_sift()` | Get and set options related to how `sift` functions. | -| `mtcars_lab` | A dataset bundled with the package for testing. | +| `options_sift()` | Get and set options related to how `siftr` functions. | +| `mtcars_lab` | A dataset bundled with the package for testing. |