KWB-R · hsonne · Mar 28, 2024 · Apr 21, 2023 · Sep 25, 2023 · Sep 25, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: kwb.utils
 Title: General Utility Functions Developed at KWB
-Version: 0.14.1
+Version: 0.15.0
 Authors@R: 
     c(person(given = "Hauke",
              family = "Sonnenberg",
@@ -20,6 +20,8 @@ Description: This package contains some small helper
 License: MIT + file LICENSE
 URL: https://github.com/kwb-r/kwb.utils
 BugReports: https://github.com/kwb-r/kwb.utils/issues
+Imports:
+    methods
 Suggests:
     PKI,
     testthat,
@@ -30,4 +32,4 @@ Suggests:
 VignetteBuilder: 
     knitr
 Encoding: UTF-8
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -31,6 +31,7 @@ export(atLeastOneRowIn)
 export(backspace)
 export(breakInSequence)
 export(callWith)
+export(callWithData)
 export(callWithStringsAsFactors)
 export(catAndRun)
 export(catChanges)
@@ -170,6 +171,7 @@ export(limitToRange)
 export(linearCombination)
 export(listObjects)
 export(listToDepth)
+export(loadFunctions)
 export(loadObject)
 export(mainClass)
 export(makeUnique)
@@ -302,4 +304,5 @@ export(warningDeprecated)
 export(windowsPath)
 export(writeDictionary)
 export(writeText)
+importFrom(methods,formalArgs)
 importFrom(utils,capture.output)
diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,21 @@
-# [kwb.utils 0.14.1](https://github.com/KWB-R/kwb.utils/releases/tag/v0.14.0) <small>2023-04-21</small>
+# [kwb.utils 0.15.0](https://github.com/KWB-R/kwb.utils/releases/tag/v0.15.0) <small>2024-03-28</small>
+
+- add loadFunctions() with private function
+  - mergeEnvironments()
+- add callWithData() with private functions 
+  - splitIntoIdenticalRows(), 
+  - rbindFirstRows(), 
+  - seqAlongRows(), 
+  - nDims(), 
+  - expandToVector()
+- multiColumnLookup()
+  - allow to lookup more than one value column
+  - add arguments "drop", "includeKeys" 
+- findPartialDuplicates()
+  - reimplement the function in order to fix a :bug:
+- use catAndRun() within sourceScripts()
+
+# [kwb.utils 0.14.1](https://github.com/KWB-R/kwb.utils/releases/tag/v0.14.1) <small>2023-04-21</small>
 
 - Fix bug in listToDepth(): pass ... arguments to FUN also if pattern is NULL
 - Add argument "silent" to private function listFiles()

diff --git a/R/callWithData.R b/R/callWithData.R
@@ -0,0 +1,137 @@
+# callWithData -----------------------------------------------------------------
+
+#' Call a Function with Argument Combinations from a Data Frame
+#'
+#' @param FUN function to be called
+#' @param data data frame with one column per argument of \code{FUN}
+#' @param \dots further (constant) arguments to \code{FUN} that are passed to
+#'   \code{\link{mapply}} via \code{MoreArgs}
+#' @param threshold if the ratio of unique value combinations in the relevant
+#'   columns in data to all value combinations in these columns is below this
+#'   threshold value then FUN will be called only with the unique value
+#'   combinations. This should increase performance.
+#' @param SIMPLIFY passed to\code{\link{mapply}}, default: \code{TRUE}
+#' @param USE.NAMES passed to\code{\link{mapply}}, default: \code{TRUE}
+#' @return vector of length \code{nrow(data)} with the result values returned by
+#'   \code{FUN}
+#' @importFrom methods formalArgs
+#' @export
+#' @examples
+#' combis <- expand.grid(x = 1:2, y = c(10, 20, 30))
+#' combis
+#'
+#' callWithData(`+`, combis)
+callWithData <- function(
+    FUN,
+    data,
+    ...,
+    threshold = 0.5,
+    SIMPLIFY = TRUE,
+    USE.NAMES = TRUE
+)
+{
+  # What arguments does FUN have?
+  all_args <- methods::formalArgs(FUN)
+
+  # Select the columns from data that are arguments of FUN
+  arg_data <- selectColumns(data, intersect(all_args, names(data)))
+
+  # Split arg_data into sets of identical rows
+  sets <- splitIntoIdenticalRows(arg_data)
+
+  # Number of all value combinations
+  n_all <- nrow(arg_data)
+
+  # Number of unique value combinations
+  n_unique <- length(sets)
+
+  # Should we run FUN only for the unique value combinations?
+  run_unique <- (n_unique / n_all < threshold)
+
+  # Name of the function to be called
+  fun_name <- deparse(substitute(FUN))
+
+  # Run FUN for each row of run_data
+  results <- catAndRun(
+    messageText = if (run_unique) {
+      sprintf(
+        "-> Calling %s() for %d unique value combinations",
+        fun_name, n_unique
+      )
+    } else {
+      sprintf(
+        "-> Calling %s() for all %d value combinations",
+        fun_name, n_all
+      )
+    },
+    expr = {
+
+      more_args <- list(...)
+
+      mapply_args_fix <- list(
+        FUN = FUN,
+        MoreArgs = if (length(more_args)) more_args,
+        SIMPLIFY = SIMPLIFY,
+        USE.NAMES = USE.NAMES
+      )
+
+      mapply_args_var <- if (run_unique) {
+        removeColumns(rbindFirstRows(sets), "row.")
+      } else {
+        arg_data
+      }
+
+      do.call(mapply, c(mapply_args_fix, mapply_args_var))
+    }
+  )
+
+  if (!run_unique) {
+    return(results)
+  }
+
+  catAndRun(
+    "-> Expanding the results to the extent of the input",
+    expandToVector(
+      x = results,
+      indices = lapply(sets, selectColumns, "row.")
+    )
+  )
+}
+
+# splitIntoIdenticalRows -------------------------------------------------------
+splitIntoIdenticalRows <- function(data)
+{
+  split(cbind(data, row. = seqAlongRows(data)), f = data, drop = TRUE)
+}
+
+# seqAlongRows ---------------------------------------------------------------
+seqAlongRows <- function(data)
+{
+  seq_len(nrow(data))
+}
+
+# rbindFirstRows ---------------------------------------------------------------
+rbindFirstRows <- function(x)
+{
+  stopifnot(is.list(x), all(sapply(x, nDims) == 2L))
+
+  resetRowNames(do.call(rbind, lapply(x, utils::head, 1L)))
+}
+
+# nDims: number of dimensions --------------------------------------------------
+nDims <- function(x)
+{
+  length(dim(x))
+}
+
+# expandToVector ---------------------------------------------------------------
+expandToVector <- function(x, indices)
+{
+  stopifnot(length(x) == length(indices))
+
+  result <- list()
+
+  result[unlist(indices)] <- rep(x, lengths(indices))
+
+  result
+}
diff --git a/R/dataFrame.R b/R/dataFrame.R
@@ -483,9 +483,8 @@ rbindAll <- function(
 #'
 resetRowNames <- function(x)
 {
-  if (length(dim(x)) != 2) {
-
-    stop(deparse(substitute(x)), " must be have two dimensions", call. = FALSE)
+  if (length(dim(x)) != 2L) {
+    stop(deparse(substitute(x)), " must have two dimensions", call. = FALSE)
   }
 
   row.names(x) <- NULL

diff --git a/R/findPartialDuplicates.R b/R/findPartialDuplicates.R
@@ -31,38 +31,27 @@
 #'
 findPartialDuplicates <- function(data, key_columns, skip_columns = NULL)
 {
-  keys <- selectColumns(data, key_columns, drop = FALSE)
-
-  # Columns to be considered in duplicate check
-  columns <- setdiff(names(data), skip_columns)
+  row_sets <- unname(split(
+    x = seq_len(nrow(data)), 
+    f = selectColumns(data, key_columns, drop = FALSE), 
+    drop = TRUE
+  ))
 
-  # Are the rows identical in all columns (except those in skip_columns)?
-  is_row_duplicate <- duplicated(data[, columns])
-
-  # Are the keys (combination of key values) duplicated?
-  is_key_duplicate <- duplicated(keys)
+  row_sets <- row_sets[lengths(row_sets) > 1L]
 
-  # Are the rows duplicated in the key columns but not fully duplicated rows?
-  is_partial_duplicate <- is_key_duplicate & ! is_row_duplicate
-
-  # For any partial duplicate, extract the differing values
-  if (any(is_partial_duplicate)) {
-
-    lapply(which(is_partial_duplicate), function(index) {
-
-      # Get the value(s) of the (composed) key
-      values <- keys[index, , drop = FALSE]
-
-      # Which rows are matching in all key columns?
-      matches <- sapply(names(values), function(x) values[[x]] == keys[[x]])
-      match_indices <- which(rowSums(matches) == length(values))
-
-      # Select the corresponding rows
-      result <- data[match_indices, , drop = FALSE]
-
-      # Select only columns in which values are actually differing
-      columns <- c(key_columns, names(result)[! sapply(result, allAreEqual)])
-      result[, columns, drop = FALSE]
-    })
+  if (length(row_sets) == 0L) {
+    return(NULL)
   }
+
+  row_sets <- row_sets[order(sapply(row_sets, "[", 1L))]
+
+  columns_to_consider <- setdiff(names(data), skip_columns)
+
+  result <- lapply(row_sets, function(rows) {
+    y <- data[rows, columns_to_consider]
+    differing_columns <- names(which(!sapply(y, kwb.utils::allAreEqual)))
+    y[, c(key_columns, differing_columns), drop = FALSE]
+  })
+
+  stats::setNames(result, NULL)
 }
diff --git a/R/loadFunctions.R b/R/loadFunctions.R
@@ -0,0 +1,85 @@
+#' Load Functions from Scripts into Attached Namespace
+#' 
+#' This function parses R scripts at the given paths and provides all objects in
+#' an environment that is attached to the search path.
+#' 
+#' @param paths vector of character with paths to files (R scripts) or 
+#'   directories (in which to look for R scripts).
+#' @param attach if \code{TRUE} the environment with all parsed objects is
+#'   attached to the search path, otherwise the environment is returned
+#' @param name name that is used when attaching the environment with all parsed
+#'   objects to the search path. Default: kwb
+#' @param dbg logical indicating whether or not to show debug messages
+#' @return if \code{attach} is \code{FALSE} the environment with all parsed 
+#'   objects is returned, otherwise \code{NULL}, invisibly.
+#' @export
+loadFunctions <- function(paths, attach = TRUE, name = "kwb", dbg = TRUE)
+{
+  # Which paths refer to directories?
+  isDirectory <- file.info(paths)[, "isdir"]
+
+  # List all R script files
+  files <- dir(paths[isDirectory], full.names = TRUE, "\\.R$")
+
+  # Add the paths that refer to (script) files
+  files <- c(files, paths[!isDirectory])
+
+  # Create a new environment in which to store all functions
+  resultEnv <- new.env()
+
+  for (file in files) {
+
+    catAndRun(
+      paste("Loading functions from", file), 
+      dbg = dbg,
+      expr = {
+
+        # Create a new environment in which to store the functions of the script
+        scriptEnv <- new.env()
+
+        # Load the functions defined in the file into the script environment
+        source(file, local = scriptEnv)
+
+        # Check for duplicated names in result and script environment
+        duplicates <- intersect(ls(scriptEnv), ls(resultEnv))
+
+        # Stop if there are duplicates
+        if (length(duplicates)) {
+          stopFormatted(
+            "The function(s) are already defined: %s", 
+            stringList(duplicates)
+          )
+        }
+
+        # Copy all elements from script environment to result environment
+        resultEnv <- mergeEnvironments(resultEnv, scriptEnv)
+      }
+    )
+  }
+
+  if (!attach) {
+    return(resultEnv)
+  }
+
+  # Detach the environment if it is already attached
+  if (name %in% search()) {
+    detach(name, character.only = TRUE)
+  }
+
+  # Attach the environment
+  attach(resultEnv, name = name)
+}
+
+# mergeEnvironments ------------------------------------------------------------
+
+# Helper function to copy the elements of one environment to another
+mergeEnvironments <- function(targetEnv, sourceEnv)
+{
+  # Assign all elements of the source environment in the target environment
+  for (name in ls(sourceEnv, all.names = TRUE)) {
+    assign(name, get(name, sourceEnv), targetEnv)
+  }
+
+  # Return the target environment
+  targetEnv
+}