diff --git a/DESCRIPTION b/DESCRIPTION index b73458d..8546247 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: siftr Type: Package -Title: Fuzzily Search a Dataframe to Find Relevant Columns -Version: 2.0.0 +Title: Fuzzily Search a data frame to Find Relevant Columns +Version: 2.2.0 Authors@R: c( person(given = "Desi", family = "Quintans", @@ -9,10 +9,10 @@ Authors@R: c( role = c("cre", "aut", "cph"), comment = c(ORCID = "0000-0003-3356-0293"))) Description: Analysts who change projects frequently know that it can be hard - to find the right column in an unfamiliar dataframe, especially when the - dataframe spans thousands of columns and millions of rows. 'siftr' is an + to find the right column in an unfamiliar data frame, especially when the + data frame spans thousands of columns and millions of rows. 'siftr' is an interactive tool that finds relevant columns by fuzzily searching through - a dataframe's column names, labels, factor levels, and unique values. + a data frame's column names, labels, factor levels, and unique values. License: MIT + file LICENSE URL: https://github.com/DesiQuintans/siftr BugReports: https://github.com/DesiQuintans/siftr/issues diff --git a/NAMESPACE b/NAMESPACE index 5e6ee0e..56b2926 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method(as.character,haven_labelled) S3method(as.double,haven_labelled) S3method(as.integer,haven_labelled) +export(last.sift) export(options_sift) export(save_dictionary) export(sift) diff --git a/R/exp_save_dictionary.R b/R/exp_save_dictionary.R index 4fb670d..523b8b5 100644 --- a/R/exp_save_dictionary.R +++ b/R/exp_save_dictionary.R @@ -34,11 +34,6 @@ #' your file explorer. #' @export #' -#' @examples -#' \donttest{ -#' save_dictionary(mtcars_lab, path = tempdir()) -#' } -#' #' @md save_dictionary <- function(df, path = stop("'path' must be specified."), ...) { df_symb <- substitute(df) @@ -62,7 +57,8 @@ save_dictionary <- function(df, path = stop("'path' must be specified."), ...) { # This rebuilds the dictionary via sift() so that the updated dictionary is # immediately available inside sift()'s closure. # https://stackoverflow.com/a/75849101/5578429 - dict <- eval(bquote(sift(.(df_symb), .rebuild = TRUE))) + + dict <- eval(bquote(sift(.(as.symbol(df_char)), .rebuild = TRUE)), envir = .siftr_env) rownames(dict) <- NULL diff --git a/R/exp_sift.R b/R/exp_sift.R index 341a54e..2501344 100644 --- a/R/exp_sift.R +++ b/R/exp_sift.R @@ -1,10 +1,8 @@ # sift() maintains its hash and dictionary lists in a closure, prooty fancy! closure.sift <- function(search_where = NULL) { - stopifnot(!is.null(search_where)) - - current_hash <- list() # Stores named hashes of dataframes. - current_dict <- list() # Stores named dataframe dictionaries. - + stopifnot( + !is.null(search_where) + ) # ---- sift() begins here ------------------------- s <- function(.df, ..., .dist = 0, .rebuild = FALSE) { @@ -12,7 +10,7 @@ closure.sift <- function(search_where = NULL) { df_name <- deparse(substitute(.df)) - # ---- 1. Ensure that df is a dataframe ------------------------------------------ + # ---- 0. Ensure that df is a dataframe ------------------------------------------ # Does it exist at all? tryCatch(is.data.frame(.df), error = function(e) { @@ -31,18 +29,26 @@ closure.sift <- function(search_where = NULL) { } - # ---- 2. Has df been sift()ed before? Has it changed since the last time? ------- + # ---- 1. Has df been sift()ed before? Has it changed since the last time? ------- + + fetched_hash <- get("saved_hash", envir = .siftr_env) + fetched_dict <- get("saved_dict", envir = .siftr_env) - new_hash <- hash_obj(df_name, current_hash) + new_hash <- hash_obj(df_name, fetched_hash) - if (.rebuild == TRUE || !identical(current_hash[[df_name]], new_hash[[df_name]])) { + if (.rebuild == TRUE || !(df_name %in% names(fetched_hash)) || !identical(fetched_hash[[df_name]], new_hash[[df_name]])) { # The new hash doesn't match the one that's stored for this object. # Update the stored hash and dictionary in the closure. - current_hash <<- new_hash - current_dict <<- build_dictionary(df_name, current_dict) - } - dict <- current_dict[[df_name]] + new_dict <- build_dictionary(df_name, fetched_dict) + + assign("saved_hash", new_hash, envir = .siftr_env) + assign("saved_dict", new_dict, envir = .siftr_env) + + dict <- new_dict[[df_name]] + } else { + dict <- fetched_dict[[df_name]] + } # ---- 3. Shortcut exit if no search is requested -------------------------------- @@ -60,7 +66,22 @@ closure.sift <- function(search_where = NULL) { " " = "{some_names}.") ) - return(invisible(dict)) + # Save details about the last query. + assign( + x = "last_query", + value = + list( + "df" = df_name, + "loc" = search_where, + "query" = ".", + "matches" = nrow(dict), + "total" = nrow(dict) + ), + envir = .siftr_env + ) + + assign("last_sift", dict, envir = .siftr_env) + return(invisible(get("last_sift", envir = .siftr_env))) } # ---- 4. If a search is needed, do one ------------------------------------------ @@ -68,10 +89,10 @@ closure.sift <- function(search_where = NULL) { search_haystack <- switch( search_where, - "all" = dict$haystack, # Searches colnames, col label, factor levels, unique values. - "name" = dict$varname, # Searches colnames only. - "desc" = dict$var_lab, # Searches col label only. - "factors" = dict$lab_lvls # Searches variable labels and factor labels. + "entirety" = dict$haystack, # Searches colnames, col label, factor levels, unique values. + "names" = dict$varname, # Searches colnames only. + "descriptions" = dict$var_lab, # Searches col label only. + "factors" = dict$lab_lvls # Searches variable labels and factor labels. ) @@ -158,14 +179,29 @@ closure.sift <- function(search_where = NULL) { } cli::cli_alert_info( - "Use {.run View(.Last.value)} to view the full table of matches." + "Use {.run siftr::last.sift()} to view the full table of matches." ) } cli::cat_line() + # Save details about the last query. + assign( + x = "last_query", + value = + list( + "df" = df_name, + "loc" = search_where, + "query" = query, + "matches" = total_results, + "total" = nrow(dict) + ), + envir = .siftr_env + ) + # Return a dataframe of all results, not just the ones that were shown. - return(invisible(dict[candidates, ])) + assign("last_sift", dict[candidates, ], envir = .siftr_env) + return(invisible(get("last_sift", envir = .siftr_env))) } return(s) @@ -241,7 +277,7 @@ closure.sift <- function(search_where = NULL) { #' } #' #' @md -sift <- closure.sift(search_where = "all") +sift <- closure.sift(search_where = "entirety") #' @describeIn sift Only search variable names (i.e. column names). @@ -250,7 +286,7 @@ sift <- closure.sift(search_where = "all") #' sift.name(mtcars_lab, "car") # Only searches variable names. #' } #' @export -sift.name <- closure.sift(search_where = "name") +sift.name <- closure.sift(search_where = "names") #' @describeIn sift Only search the descriptive labels of variables. @@ -259,7 +295,7 @@ sift.name <- closure.sift(search_where = "name") #' sift.desc(mtcars_lab, "car") # Only searches variable descriptions. #' } #' @export -sift.desc <- closure.sift(search_where = "desc") +sift.desc <- closure.sift(search_where = "descriptions") #' @describeIn sift Only search factor labels. This includes "value labels", e.g. 'haven_labelled' types. diff --git a/R/prv_hash_obj.R b/R/prv_hash_obj.R index a8f7b16..0d72f06 100644 --- a/R/prv_hash_obj.R +++ b/R/prv_hash_obj.R @@ -45,7 +45,7 @@ # } # @md hash_obj <- function(df_name, hashlist) { - # Early dev versions of sift hashed the entire object, but this became prohibitive + # Early dev versions of sift hashed the entire object, which became prohibitive # with even modest datasets (1 GB, ~ 300 cols and 2 million rows). But the entire # dataframe doesn't really need to be hashed, we can get lots of info from hashing # only part of it. diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..3cfff6c --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,36 @@ +# Create a package-local environment +# +.siftr_env <- NULL + + +.onLoad <- function(libname, pkgname) { + # Sets up a hidden environment wherein dictionaries, hashes, etc. are stored. + # Make the parent emptyenv() so that it does not pick up other values by + # inheritance. + # + .siftr_env <<- new.env(parent = emptyenv()) + + # `saved_hash` is a named list of hashed dataframe fingerprints, where the + # names are the object names of the dataframes. + assign(x = "saved_hash", value = list(), envir = .siftr_env) + + # `saved_dict` is a named list of built dictionaries, where the names + # are the object names of the dataframes. + assign(x = "saved_dict", value = list(), envir = .siftr_env) + + # `last_sift` is the result of the last sift(), i.e. one dataframe. + # Initialised as a dataframe with 0 rows and 0 cols. + # If the last result had no matches, it can be a dataframe with 0 rows and >0 cols. + assign(x = "last_sift", value = data.frame(), envir = .siftr_env) + + # `last_query` is a named list containing the name of last dataframe sifted, + # the query that was run, what fields were searched, the number of matching + # variables that resulted, and the total number of variables in the + # dataframe. + assign(x = "last_query", value = list("df" = "", "loc" = "", "query" = "", "matches" = 0, "total" = 0), envir = .siftr_env) +} + + +.onUnload <- function(libname, pkgname) { + rm(.siftr_env) +} diff --git a/man/save_dictionary.Rd b/man/save_dictionary.Rd index c8381a7..bdeb2d4 100644 --- a/man/save_dictionary.Rd +++ b/man/save_dictionary.Rd @@ -43,9 +43,3 @@ to take advantage of \code{vroom}, or in FST to take advantage of \code{fst}. you need to relabel it every time. } } -\examples{ -\donttest{ -save_dictionary(mtcars_lab, path = tempdir()) -} - -} diff --git a/man/siftr-package.Rd b/man/siftr-package.Rd index b1188a5..736b910 100644 --- a/man/siftr-package.Rd +++ b/man/siftr-package.Rd @@ -4,9 +4,9 @@ \name{siftr-package} \alias{siftr} \alias{siftr-package} -\title{siftr: Fuzzily Search a Dataframe to Find Relevant Columns} +\title{siftr: Fuzzily Search a data frame to Find Relevant Columns} \description{ -Analysts who change projects frequently know that it can be hard to find the right column in an unfamiliar dataframe, especially when the dataframe spans thousands of columns and millions of rows. 'siftr' is an interactive tool that finds relevant columns by fuzzily searching through a dataframe's column names, labels, factor levels, and unique values. +Analysts who change projects frequently know that it can be hard to find the right column in an unfamiliar data frame, especially when the data frame spans thousands of columns and millions of rows. 'siftr' is an interactive tool that finds relevant columns by fuzzily searching through a data frame's column names, labels, factor levels, and unique values. } \seealso{ Useful links: