Skip to content

Commit

Permalink
Store dictionaries etc. in environment instead of closure
Browse files Browse the repository at this point in the history
Closes #25.
  • Loading branch information
DesiQuintans committed Jul 18, 2024
1 parent d92b4ee commit 84b055c
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 43 deletions.
10 changes: 5 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
Package: siftr
Type: Package
Title: Fuzzily Search a Dataframe to Find Relevant Columns
Version: 2.0.0
Title: Fuzzily Search a data frame to Find Relevant Columns
Version: 2.2.0
Authors@R: c(
person(given = "Desi",
family = "Quintans",
email = "science@desiquintans.com",
role = c("cre", "aut", "cph"),
comment = c(ORCID = "0000-0003-3356-0293")))
Description: Analysts who change projects frequently know that it can be hard
to find the right column in an unfamiliar dataframe, especially when the
dataframe spans thousands of columns and millions of rows. 'siftr' is an
to find the right column in an unfamiliar data frame, especially when the
data frame spans thousands of columns and millions of rows. 'siftr' is an
interactive tool that finds relevant columns by fuzzily searching through
a dataframe's column names, labels, factor levels, and unique values.
a data frame's column names, labels, factor levels, and unique values.
License: MIT + file LICENSE
URL: https://github.com/DesiQuintans/siftr
BugReports: https://github.com/DesiQuintans/siftr/issues
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
S3method(as.character,haven_labelled)
S3method(as.double,haven_labelled)
S3method(as.integer,haven_labelled)
export(last.sift)
export(options_sift)
export(save_dictionary)
export(sift)
Expand Down
8 changes: 2 additions & 6 deletions R/exp_save_dictionary.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,6 @@
#' your file explorer.
#' @export
#'
#' @examples
#' \donttest{
#' save_dictionary(mtcars_lab, path = tempdir())
#' }
#'
#' @md
save_dictionary <- function(df, path = stop("'path' must be specified."), ...) {
df_symb <- substitute(df)
Expand All @@ -62,7 +57,8 @@ save_dictionary <- function(df, path = stop("'path' must be specified."), ...) {
# This rebuilds the dictionary via sift() so that the updated dictionary is
# immediately available inside sift()'s closure.
# https://stackoverflow.com/a/75849101/5578429
dict <- eval(bquote(sift(.(df_symb), .rebuild = TRUE)))

dict <- eval(bquote(sift(.(as.symbol(df_char)), .rebuild = TRUE)), envir = .siftr_env)
rownames(dict) <- NULL


Expand Down
82 changes: 59 additions & 23 deletions R/exp_sift.R
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
# sift() maintains its hash and dictionary lists in a closure, prooty fancy!
closure.sift <- function(search_where = NULL) {
stopifnot(!is.null(search_where))

current_hash <- list() # Stores named hashes of dataframes.
current_dict <- list() # Stores named dataframe dictionaries.

stopifnot(
!is.null(search_where)
)

# ---- sift() begins here -------------------------
s <- function(.df, ..., .dist = 0, .rebuild = FALSE) {
# The df is passed to internal functions as a Char string, then evaluated later.
df_name <- deparse(substitute(.df))


# ---- 1. Ensure that df is a dataframe ------------------------------------------
# ---- 0. Ensure that df is a dataframe ------------------------------------------

# Does it exist at all?
tryCatch(is.data.frame(.df), error = function(e) {
Expand All @@ -31,18 +29,26 @@ closure.sift <- function(search_where = NULL) {
}


# ---- 2. Has df been sift()ed before? Has it changed since the last time? -------
# ---- 1. Has df been sift()ed before? Has it changed since the last time? -------

fetched_hash <- get("saved_hash", envir = .siftr_env)
fetched_dict <- get("saved_dict", envir = .siftr_env)

new_hash <- hash_obj(df_name, current_hash)
new_hash <- hash_obj(df_name, fetched_hash)

if (.rebuild == TRUE || !identical(current_hash[[df_name]], new_hash[[df_name]])) {
if (.rebuild == TRUE || !(df_name %in% names(fetched_hash)) || !identical(fetched_hash[[df_name]], new_hash[[df_name]])) {
# The new hash doesn't match the one that's stored for this object.
# Update the stored hash and dictionary in the closure.
current_hash <<- new_hash
current_dict <<- build_dictionary(df_name, current_dict)
}

dict <- current_dict[[df_name]]
new_dict <- build_dictionary(df_name, fetched_dict)

assign("saved_hash", new_hash, envir = .siftr_env)
assign("saved_dict", new_dict, envir = .siftr_env)

dict <- new_dict[[df_name]]
} else {
dict <- fetched_dict[[df_name]]
}


# ---- 3. Shortcut exit if no search is requested --------------------------------
Expand All @@ -60,18 +66,33 @@ closure.sift <- function(search_where = NULL) {
" " = "{some_names}.")
)

return(invisible(dict))
# Save details about the last query.
assign(
x = "last_query",
value =
list(
"df" = df_name,
"loc" = search_where,
"query" = ".",
"matches" = nrow(dict),
"total" = nrow(dict)
),
envir = .siftr_env
)

assign("last_sift", dict, envir = .siftr_env)
return(invisible(get("last_sift", envir = .siftr_env)))
}

# ---- 4. If a search is needed, do one ------------------------------------------

search_haystack <-
switch(
search_where,
"all" = dict$haystack, # Searches colnames, col label, factor levels, unique values.
"name" = dict$varname, # Searches colnames only.
"desc" = dict$var_lab, # Searches col label only.
"factors" = dict$lab_lvls # Searches variable labels and factor labels.
"entirety" = dict$haystack, # Searches colnames, col label, factor levels, unique values.
"names" = dict$varname, # Searches colnames only.
"descriptions" = dict$var_lab, # Searches col label only.
"factors" = dict$lab_lvls # Searches variable labels and factor labels.
)


Expand Down Expand Up @@ -158,14 +179,29 @@ closure.sift <- function(search_where = NULL) {
}

cli::cli_alert_info(
"Use {.run View(.Last.value)} to view the full table of matches."
"Use {.run siftr::last.sift()} to view the full table of matches."
)
}

cli::cat_line()

# Save details about the last query.
assign(
x = "last_query",
value =
list(
"df" = df_name,
"loc" = search_where,
"query" = query,
"matches" = total_results,
"total" = nrow(dict)
),
envir = .siftr_env
)

# Return a dataframe of all results, not just the ones that were shown.
return(invisible(dict[candidates, ]))
assign("last_sift", dict[candidates, ], envir = .siftr_env)
return(invisible(get("last_sift", envir = .siftr_env)))
}

return(s)
Expand Down Expand Up @@ -241,7 +277,7 @@ closure.sift <- function(search_where = NULL) {
#' }
#'
#' @md
sift <- closure.sift(search_where = "all")
sift <- closure.sift(search_where = "entirety")


#' @describeIn sift Only search variable names (i.e. column names).
Expand All @@ -250,7 +286,7 @@ sift <- closure.sift(search_where = "all")
#' sift.name(mtcars_lab, "car") # Only searches variable names.
#' }
#' @export
sift.name <- closure.sift(search_where = "name")
sift.name <- closure.sift(search_where = "names")


#' @describeIn sift Only search the descriptive labels of variables.
Expand All @@ -259,7 +295,7 @@ sift.name <- closure.sift(search_where = "name")
#' sift.desc(mtcars_lab, "car") # Only searches variable descriptions.
#' }
#' @export
sift.desc <- closure.sift(search_where = "desc")
sift.desc <- closure.sift(search_where = "descriptions")


#' @describeIn sift Only search factor labels. This includes "value labels", e.g. 'haven_labelled' types.
Expand Down
2 changes: 1 addition & 1 deletion R/prv_hash_obj.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
# }
# @md
hash_obj <- function(df_name, hashlist) {
# Early dev versions of sift hashed the entire object, but this became prohibitive
# Early dev versions of sift hashed the entire object, which became prohibitive
# with even modest datasets (1 GB, ~ 300 cols and 2 million rows). But the entire
# dataframe doesn't really need to be hashed, we can get lots of info from hashing
# only part of it.
Expand Down
36 changes: 36 additions & 0 deletions R/zzz.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Create a package-local environment
# <https://stackoverflow.com/a/55622386/5578429>
.siftr_env <- NULL


.onLoad <- function(libname, pkgname) {
# Sets up a hidden environment wherein dictionaries, hashes, etc. are stored.
# Make the parent emptyenv() so that it does not pick up other values by
# inheritance.
# <https://stackoverflow.com/questions/12598242/global-variables-in-packages-in-r/12605694#comment66987725_12605694>
.siftr_env <<- new.env(parent = emptyenv())

# `saved_hash` is a named list of hashed dataframe fingerprints, where the
# names are the object names of the dataframes.
assign(x = "saved_hash", value = list(), envir = .siftr_env)

# `saved_dict` is a named list of built dictionaries, where the names
# are the object names of the dataframes.
assign(x = "saved_dict", value = list(), envir = .siftr_env)

# `last_sift` is the result of the last sift(), i.e. one dataframe.
# Initialised as a dataframe with 0 rows and 0 cols.
# If the last result had no matches, it can be a dataframe with 0 rows and >0 cols.
assign(x = "last_sift", value = data.frame(), envir = .siftr_env)

# `last_query` is a named list containing the name of last dataframe sifted,
# the query that was run, what fields were searched, the number of matching
# variables that resulted, and the total number of variables in the
# dataframe.
assign(x = "last_query", value = list("df" = "", "loc" = "", "query" = "", "matches" = 0, "total" = 0), envir = .siftr_env)
}


.onUnload <- function(libname, pkgname) {
rm(.siftr_env)
}
6 changes: 0 additions & 6 deletions man/save_dictionary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/siftr-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 84b055c

Please sign in to comment.