Skip to content

Commit

Permalink
Merge pull request #24 from DesiQuintans/new_verbs
Browse files Browse the repository at this point in the history
Add new verbs to sift()
  • Loading branch information
DesiQuintans authored Jul 15, 2024
2 parents d03152d + 943610f commit d92b4ee
Show file tree
Hide file tree
Showing 17 changed files with 203 additions and 195 deletions.
8 changes: 5 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: siftr
Type: Package
Title: Fuzzily Search a Dataframe to Find Relevant Columns
Version: 1.1.1
Version: 2.0.0
Authors@R: c(
person(given = "Desi",
family = "Quintans",
Expand All @@ -10,7 +10,7 @@ Authors@R: c(
comment = c(ORCID = "0000-0003-3356-0293")))
Description: Analysts who change projects frequently know that it can be hard
to find the right column in an unfamiliar dataframe, especially when the
dataframe spans hundreds of columns and millions of rows. 'siftr' is an
dataframe spans thousands of columns and millions of rows. 'siftr' is an
interactive tool that finds relevant columns by fuzzily searching through
a dataframe's column names, labels, factor levels, and unique values.
License: MIT + file LICENSE
Expand All @@ -23,4 +23,6 @@ Depends:
Imports:
cli,
fastdigest
RoxygenNote: 7.2.3
Suggests:
rlang
RoxygenNote: 7.3.2
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ S3method(as.integer,haven_labelled)
export(options_sift)
export(save_dictionary)
export(sift)
export(sift.desc)
export(sift.factors)
export(sift.name)
2 changes: 1 addition & 1 deletion R/exp_data_mtcars_lab.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@

# ---- This documents mtcars_lab -----------------------------------------------

#' Labelled version of mtcars for testing `siftr`
#' Labelled version of `mtcars` for testing `siftr`
#'
#' This is `mtcars` with value labels, variable labels (in `vs` only), some transformation
#' to factor (`car` and `am`), an added Logical column (`above_avg`), an added column
Expand Down
7 changes: 4 additions & 3 deletions R/exp_options_sift.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,10 @@ options_sift <- function(key = c("sift_limit", "sift_guessmax", "sift_peeklength
}

if ((key %in% names(default_setting)) == FALSE) {
cli::cli_abort(c(
"x" = msg_sift("not option", 1, key),
"i" = msg_sift("not option", 2, fold_or(names(default_setting)))
cli::cli_abort(
message = c(
"x" = "{.val {key}} is not one of {.pkg siftr}'s options.",
"i" = "Accepted options are: {.val {names(default_setting)}}."
))
}

Expand Down
2 changes: 1 addition & 1 deletion R/exp_save_dictionary.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#'
#' @examples
#' \donttest{
#' save_dictionary(CO2, path = tempdir())
#' save_dictionary(mtcars_lab, path = tempdir())
#' }
#'
#' @md
Expand Down
142 changes: 107 additions & 35 deletions R/exp_sift.R
Original file line number Diff line number Diff line change
@@ -1,21 +1,33 @@
# sift() maintains its hash and dictionary lists in a closure, prooty fancy!
closure.sift <- function() {
closure.sift <- function(search_where = NULL) {
stopifnot(!is.null(search_where))

current_hash <- list() # Stores named hashes of dataframes.
current_dict <- list() # Stores named dataframe dictionaries.



# ---- sift() begins here -------------------------
s <- function(.df, ..., .dist = 0, .rebuild = FALSE) {
# The df is passed to internal functions as a Char string, then evaluated later.
df_name <- deparse(substitute(.df))


# ---- 1. Ensure that df is a dataframe ------------------------------------------

# Does it exist at all?
tryCatch(is.data.frame(.df), error = function(e) {
cli::cli_abort(
message = c("No object named {.var {df_name}} was found.",
" " = "Does it exist? Is its name correct?"),
call = NULL)
})

# Is it a dataframe?
if (!is.data.frame(.df)) {
cli::cli_alert_danger( msg_sift("not a df", 1, df_name))
cli::cli_alert_warning(msg_sift("not a df", 2))
cli::cat_line()
cli::cli_abort(
message = c("{.var {df_name}} is not a dataframe.",
" " = "{.pkg siftr} only searches through dataframes."),
call = NULL)
}


Expand All @@ -33,44 +45,64 @@ closure.sift <- function() {
dict <- current_dict[[df_name]]


# ---- 2. Convert ... into a query and perform a search --------------------------
# ---- 3. Shortcut exit if no search is requested --------------------------------

orig_query <- nse_dots(...)
orig_query <- nse_dots(...)

if (identical(orig_query, character(0))) {
# If dots is empty, then return the dictionary itself.

cli::cli_alert_info(msg_sift("report dims", 1,
df_name,
length(unique(dict[["varname"]])),
fold_middle(dict[["varname"]], n = 50)),
wrap = TRUE)
cli::cat_line()
# Report some of the variables in the dataframe.
num_cols <- length(unique(dict[["varname"]]))
some_names <- fold_middle(dict[["varname"]], n = 50)
cli::cli_inform(
message = c("i" = "{.var {df_name}} has {num_cols} column{?s}:",
" " = "{some_names}.")
)

return(invisible(dict))
} else if (length(orig_query) == 1) {
}

# ---- 4. If a search is needed, do one ------------------------------------------

search_haystack <-
switch(
search_where,
"all" = dict$haystack, # Searches colnames, col label, factor levels, unique values.
"name" = dict$varname, # Searches colnames only.
"desc" = dict$var_lab, # Searches col label only.
"factors" = dict$lab_lvls # Searches variable labels and factor labels.
)


if (length(orig_query) == 1) {
# If dots has just one item in it, then treat it as an agrep() search, which
# is possibly a regular expression.

query <- orig_query

candidates <-
agrep(query, dict$haystack, ignore.case = TRUE, value = FALSE,
agrep(query, search_haystack, ignore.case = TRUE, value = FALSE,
fixed = FALSE, max.distance = .dist)
} else {
# But if dots has more than one element, then use it to build a fuzzy search
# with look-around.

if (.dist > 0) {
cli::cli_warn(c("!" = msg_sift("dist_ignore", 1, .dist),
"i" = msg_sift("dist_ignore", 2)))

cli::cli_inform(
message = c(
"An orderless search was performed, so {.arg .dist = {(.dist)}} was ignored.",
"To remove this warning, either remove {.arg .dist} or provide your query as a single character string."
)
)
}

query <- fuzzy_needle(orig_query) # E.g. (?=.*gallon)(?=.*mileage)

candidates <-
# Fuzzy needle requires PERL regex, which agrep and aregexc don't support.
grep(query, dict$haystack, ignore.case = TRUE, perl = TRUE)
grep(query, search_haystack, ignore.case = TRUE, perl = TRUE)
}


Expand All @@ -89,38 +121,49 @@ closure.sift <- function() {
# If there are no matches (integer(0)), this returns a dataframe with no rows.
found <- dict[shown_candidates, ]

cli::cli_h1("Results for {.code {query}} in {.pkg {df_name}}")

if (length(shown_candidates) >= 1) {
# There's at least one match. I use display_row() to print every row of the return dataframe.
apply(X = found, MARGIN = 1, FUN = display_row)
}

cli::cli_h2("Summary")

if (length(shown_candidates) < 1) {
# No matches.
cli::cli_alert_danger(msg_sift("no matches", 1, query, .dist))

cli::cli_alert_danger("No matches found for query {.code {query}}.", wrap = TRUE)

if (grepl("\`", query)) {
# Backtick was found. This can happen if a regex was passed as a bare name.
cli::cli_alert_info(msg_sift("no matches", 2), wrap = TRUE)
cli::cli_alert_info("If you're using a regular expression, pass it as a string.", wrap = TRUE)
}

if (length(orig_query) == 1) {
cli::cli_alert_info(msg_sift("no matches", 3, .dist), wrap = TRUE)
cli::cli_alert_info("Try increasing {.arg .dist = {(.dist)}} to allow more distant matches.", wrap = TRUE)
}

cli::cat_line()
} else {
# There's at least one match. I use display_row() to print every row of the return dataframe.
apply(X = found, MARGIN = 1, FUN = display_row)
# `n results` = c("There %s %i result%s for query `%s`."),
#
# `over limit` = c("Only %1$s of them %2$s printed, set by options_sift(\"sift_limit\", %1$s)"),

plur <- plural(length(shown_candidates))
cli::cat_line()
cli::cli_alert_success(msg_sift("n results", 1,
plur$were,
total_results,
plur$s, query))
cli::cli_alert_success("Found {total_results} result{?s} for query {.code {query}}.", wrap = TRUE)

if (excess_results > 0) {
cli::cli_alert_warning(msg_sift("over limit", 1,
options_sift("sift_limit"),
plural(excess_results)$was))
cli::cli_alert_warning(
"Only {length(shown_candidates)} of them {?was/were} printed, set by {.run options_sift(\"sift_limit\", {options_sift('sift_limit')})}.",
wrap = TRUE
)
}

cli::cli_alert_info(
"Use {.run View(.Last.value)} to view the full table of matches."
)
}

cli::cat_line()

# Return a dataframe of all results, not just the ones that were shown.
return(invisible(dict[candidates, ]))
}
Expand Down Expand Up @@ -176,6 +219,8 @@ closure.sift <- function() {
#' - If the query was matched, only returns matching rows of the data dictionary.
#' - If the query was not matched, return no rows of the dictionary (but all columns).
#'
#' @describeIn sift Search variable names, descriptive labels, factor labels, and unique values.
#'
#' @seealso [siftr::save_dictionary()], [siftr::options_sift()]
#'
#' @export
Expand All @@ -196,4 +241,31 @@ closure.sift <- function() {
#' }
#'
#' @md
sift <- closure.sift()
sift <- closure.sift(search_where = "all")


#' @describeIn sift Only search variable names (i.e. column names).
#' @examples
#' \donttest{
#' sift.name(mtcars_lab, "car") # Only searches variable names.
#' }
#' @export
sift.name <- closure.sift(search_where = "name")


#' @describeIn sift Only search the descriptive labels of variables.
#' @examples
#' \donttest{
#' sift.desc(mtcars_lab, "car") # Only searches variable descriptions.
#' }
#' @export
sift.desc <- closure.sift(search_where = "desc")


#' @describeIn sift Only search factor labels. This includes "value labels", e.g. 'haven_labelled' types.
#' @examples
#' \donttest{
#' sift.factors(mtcars_lab, "manual") # Only searches factor levels and value labels.
#' }
#' @export
sift.factors <- closure.sift(search_where = "factors")
29 changes: 23 additions & 6 deletions R/prv_build_dictionary.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ build_dictionary <- function(DF, dictlist) {
df_name <- DF
DF <- eval(as.symbol(DF))

cli::cli_alert_info(msg_sift("building", 1, df_name), wrap = TRUE)
cli::cli_inform(
message = c("i" = "Building the dictionary for {.var {df_name}}...")
)

start_time <- Sys.time()

Expand All @@ -19,6 +21,16 @@ build_dictionary <- function(DF, dictlist) {
raw_var_labs <- sapply(DF, attr, "label")
raw_val_labs <- sapply(DF, function(col) { names(attr(col, "labels")) }) # The names are what I want.
raw_fct_lvls <- sapply(DF, levels)
# `raw_lab_lvls` combines the value labels and factor levels into one thing. With SAS datasets, for example,
# `haven` imports factors as labelled variables and not as proper factors.
raw_lab_lvls <- mapply(
function(x, y) {
result <- unique(c(x, y))
result[is.null(result)] <- ""
return(result)
},
raw_fct_lvls, raw_val_labs, SIMPLIFY = FALSE)


# Extra details for the data dictionary
dct_type_strs <- sapply(DF, coltype)
Expand All @@ -30,11 +42,12 @@ build_dictionary <- function(DF, dictlist) {
dct_all_same <- sapply(DF, invariant)

# Getting labels into vectors of length 1.
var_labs <- esc_braces(crunch(raw_var_labs))
val_labs <- esc_braces(crunch(raw_val_labs))
fct_lvls <- esc_braces(crunch(raw_fct_lvls))
var_labs <- esc_braces(crunch(raw_var_labs))
val_labs <- esc_braces(crunch(raw_val_labs))
fct_lvls <- esc_braces(crunch(raw_fct_lvls))
labs_lvls <- esc_braces(crunch(raw_lab_lvls))
# Those labels and unique values joined together to make searchable strings.
haystacks <- smash(raw_varnames, var_labs, val_labs, fct_lvls, dct_rand_uniq)
haystacks <- smash(raw_varnames, var_labs, labs_lvls, dct_rand_uniq)

dictionary <-
data.frame(
Expand All @@ -48,7 +61,9 @@ build_dictionary <- function(DF, dictlist) {
all_same = dct_all_same,
val_lab = codify(raw_val_labs),
fct_lvl = codify(raw_fct_lvls),
labs_lvls = labs_lvls,
fct_ordered = dct_ordered,

class = codify(dct_classes),
type = codify(dct_types),
haystack = haystacks,
Expand All @@ -61,7 +76,9 @@ build_dictionary <- function(DF, dictlist) {
elapsed <- round(end_time - start_time, digits = 2)
elapsed_str <- paste(elapsed, attr(elapsed, "units"))

cli::cli_alert_success(msg_sift("built", 1, elapsed_str))
cli::cli_inform(
message = c("v" = "Dictionary was built in {elapsed_str}."),
)
cli::cat_line()

return(invisible(dictlist))
Expand Down
39 changes: 0 additions & 39 deletions R/prv_msg_sift.R

This file was deleted.

Loading

0 comments on commit d92b4ee

Please sign in to comment.