mlr-org
diff --git a/‎.github/workflows/pkgdown.yml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/pkgdown.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎DESCRIPTION
Lines changed: 2 additions & 1 deletion b/‎DESCRIPTION
Lines changed: 2 additions & 1 deletion
diff --git a/‎NAMESPACE
Lines changed: 1 addition & 0 deletions b/‎NAMESPACE
Lines changed: 1 addition & 0 deletions
diff --git a/‎NEWS.md
Lines changed: 2 additions & 0 deletions b/‎NEWS.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎R/PipeOpDecode.R
Lines changed: 268 additions & 0 deletions b/‎R/PipeOpDecode.R
Lines changed: 268 additions & 0 deletions
diff --git a/‎R/PipeOpKernelPCA.R
Lines changed: 2 additions & 2 deletions b/‎R/PipeOpKernelPCA.R
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/po.R
Lines changed: 2 additions & 2 deletions b/‎R/po.R
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/ppl.R
Lines changed: 2 additions & 2 deletions b/‎R/ppl.R
Lines changed: 2 additions & 2 deletions
diff --git a/‎man/PipeOp.Rd
Lines changed: 1 addition & 0 deletions b/‎man/PipeOp.Rd
Lines changed: 1 addition & 0 deletions
diff --git a/‎man/PipeOpEnsemble.Rd
Lines changed: 1 addition & 0 deletions b/‎man/PipeOpEnsemble.Rd
Lines changed: 1 addition & 0 deletions
diff --git a/‎man/PipeOpImpute.Rd
Lines changed: 1 addition & 0 deletions b/‎man/PipeOpImpute.Rd
Lines changed: 1 addition & 0 deletions
diff --git a/‎man/PipeOpTargetTrafo.Rd
Lines changed: 1 addition & 0 deletions b/‎man/PipeOpTargetTrafo.Rd
Lines changed: 1 addition & 0 deletions
diff --git a/‎man/PipeOpTaskPreproc.Rd
Lines changed: 1 addition & 0 deletions b/‎man/PipeOpTaskPreproc.Rd
Lines changed: 1 addition & 0 deletions
@@ -44,7 +44,7 @@ jobs:
 
       - name: Deploy
         if: github.event_name != 'pull_request'
-        uses: JamesIves/github-pages-deploy-action@v4.6.9
+        uses: JamesIves/github-pages-deploy-action@v4.7.2
         with:
           clean: false
           branch: gh-pages
 
@@ -66,7 +66,7 @@ Imports:
     digest,
     lgr,
     mlr3 (>= 0.20.0),
-    mlr3misc (>= 0.9.0),
+    mlr3misc (>= 0.16.0),
     paradox,
     R6,
     withr
@@ -141,6 +141,7 @@ Collate:
     'PipeOpCollapseFactors.R'
     'PipeOpCopy.R'
     'PipeOpDateFeatures.R'
+    'PipeOpDecode.R'
     'PipeOpEncode.R'
     'PipeOpEncodeImpact.R'
     'PipeOpEncodeLmer.R'
 
@@ -108,6 +108,7 @@ export(PipeOpColRoles)
 export(PipeOpCollapseFactors)
 export(PipeOpCopy)
 export(PipeOpDateFeatures)
+export(PipeOpDecode)
 export(PipeOpEncode)
 export(PipeOpEncodeImpact)
 export(PipeOpEncodeLmer)
 
@@ -5,6 +5,8 @@
 * Fix: `LearnerClassifAvg` and `LearnerRegrAvg` hyperparameters get the `"required"` tag.
 * New parameter `use_groups` (default `TRUE`) for `PipeOpSubsampling` to respect grouping (changed default behaviour for grouped data)
 * New parameter `new_role_direct` for `PipeOpColRoles` / `po("colroles")` to change column roles by role instead of by column.
+* Dictionary sugar functions `po()` / `pos()` / `ppl()` / `ppls()` now make suggestions for entries in both `mlr_pipeops` as well as `mlr_graphs` when an object by the given name could not be found in the respective dictionary.
+* New PipeOp `PipeOpDecode` / `po("decode")` to reverse one-hot or treatment encoding.
 
 # mlr3pipelines 0.7.1
 
 
@@ -0,0 +1,268 @@
+#' @title Reverse Factor Encoding
+#'
+#' @usage NULL
+#' @name mlr_pipeops_decode
+#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
+#'
+#' @description
+#' Reverses one-hot or treatment encoding of columns. It collapses multiple `numeric` or `integer` columns into one `factor`
+#' column based on a pre-specified grouping pattern of column names.
+#'
+#' May be applied to multiple groups of columns, grouped by matching a common naming pattern. The grouping pattern is
+#' extracted to form the name of the newly derived `factor` column, and levels are constructed from the previous column
+#' names, with parts matching the grouping pattern removed (see examples). The level per row of the new factor column is generally
+#' determined as the name of the column with the maximum value in the group.
+#'
+#' @section Construction:
+#' ```
+#' PipeOpEncode$new(id = "decode", param_vals = list())
+#' ```
+#' * `id` :: `character(1)`\cr
+#'   Identifier of resulting object, default `"decode"`.
+#' * `param_vals` :: named `list`\cr
+#'   List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
+#'
+#' @section Input and Output Channels:
+#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
+#'
+#' The output is the input [`Task`][mlr3::Task] with encoding columns collapsed into new decoded columns.
+#'
+#' @section State:
+#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
+#' * `colmaps` :: named `list`\cr
+#'   Named list of named character vectors. Each element is named according to the new column name extracted by
+#'   `group_pattern`. Each vector contains the level names for the new factor column that should be created, named by
+#'   the corresponding old column name. If `treatment_encoding` is `TRUE`, then each vector also contains `ref_name` as the
+#'   reference class with an empty string as name.
+#' * `treatment_encoding` :: `logical(1)`\cr
+#'   Value of `treatment_encoding` hyperparameter.
+#' * `cutoff` :: `numeric(1)`\cr
+#'   Value of `treatment_encoding` hyperparameter, or `0` if that is not given.
+#' * `ties_method` :: `character(1)`\cr
+#'   Value of `ties_method` hyperparameter.
+#'
+#' @section Parameters:
+#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
+#' * `group_pattern` :: `character(1)`\cr
+#'   A regular expression to be applied to column names. Should contain a capturing group for the new
+#'   column name, and match everything that should not be interpreted as the new factor levels (which are constructed as
+#'   the difference between column names and what `group_pattern` matches).
+#'   If set to `""`, all columns matching the `group_pattern` are collapsed into one factor column called
+#'   `pipeop.decoded`. Use [`PipeOpRenameColumns`] to rename this column.
+#'   Initialized to `"^([^.]+)\\."`, which would extract everything up to the first dot as the new column name and
+#'   construct new levels as everything after the first dot.
+#' * `treatment_encoding` :: `logical(1)`\cr
+#'   If `TRUE`, treatment encoding is assumed instead of one-hot encoding. Initialized to `FALSE`.
+#' * `treatment_cutoff` :: `numeric(1)`\cr
+#'   If `treatment_encoding` is `TRUE`, specifies a cutoff value for identifying the reference level. The reference level
+#'   is set to `ref_name` in rows where the value is less than or equal to a specified cutoff value (e.g., `0`) in all
+#'   columns in that group. Default is `0`.
+#' * `ref_name` :: `character(1)`\cr
+#'   If `treatment_encoding` is `TRUE`, specifies the name for reference levels. Default is `"ref"`.
+#' * `ties_method` :: `character(1)`\cr
+#'   Method for resolving ties if multiple columns have the same value. Specifies the value from which of the columns
+#'   with the same value is to be picked. Options are `"first"`, `"last"`, or `"random"`. Initialized to `"random"`.
+#'
+#' @section Methods:
+#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
+#'
+#' @family PipeOps
+#' @template seealso_pipeopslist
+#' @include PipeOpTaskPreproc.R
+#' @export
+#' @examples
+#' library("mlr3")
+#'
+#' # Reverse one-hot encoding
+#' df = data.frame(
+#'   target = runif(4),
+#'   x.1 = rep(c(1, 0), 2),
+#'   x.2 = rep(c(0, 1), 2),
+#'   y.1 = rep(c(1, 0), 2),
+#'   y.2 = rep(c(0, 1), 2),
+#'   a = runif(4)
+#' )
+#' task_one_hot = TaskRegr$new(id = "example", backend = df, target = "target")
+#'
+#' pop = po("decode")
+#'
+#' train_out = pop$train(list(task_one_hot))[[1]]
+#' # x.1 and x.2 are collapsed into x, same for y; a is ignored.
+#' train_out$data()
+#'
+#' # Reverse treatment encoding from PipeOpEncode
+#' df = data.frame(
+#'   target = runif(6),
+#'   fct = factor(rep(c("a", "b", "c"), 2))
+#' )
+#' task = TaskRegr$new(id = "example", backend = df, target = "target")
+#'
+#' po_enc = po("encode", method = "treatment")
+#' task_encoded = po_enc$train(list(task))[[1]]
+#' task_encoded$data()
+#'
+#' po_dec = po("decode", treatment_encoding = TRUE)
+#' task_decoded = pop$train(list(task))[[1]]
+#' # x.1 and x.2 are collapsed into x. All rows where all values
+#' # are smaller or equal to 0, the level is set to the reference level.
+#' task_decoded$data()
+#'
+#' # Different group_pattern
+#' df = data.frame(
+#'   target = runif(4),
+#'   x_1 = rep(c(1, 0), 2),
+#'   x_2 = rep(c(0, 1), 2),
+#'   y_1 = rep(c(2, 0), 2),
+#'   y_2 = rep(c(0, 1), 2)
+#' )
+#' task = TaskRegr$new(id = "example", backend = df, target = "target")
+#'
+#' # Grouped by first underscore
+#' pop = po("decode", group_pattern = "^([^_]+)\\_")
+#' train_out = pop$train(list(task))[[1]]
+#' # x_1 and x_2 are collapsed into x, same for y
+#' train_out$data()
+#'
+#' # Empty string to collapse all matches into one factor column.
+#' pop$param_set$set_values(group_pattern = "")
+#' train_out = pop$train(list(task))[[1]]
+#' # All columns are combined into a single column.
+#' # The level for each row is determined by the column with the largest value in that row.
+#' # By default, ties are resolved randomly.
+#' train_out$data()
+#'
+PipeOpDecode = R6Class("PipeOpDecode",
+  inherit = PipeOpTaskPreprocSimple,
+  public = list(
+    initialize = function(id = "decode", param_vals = list()) {
+      ps = ps(
+        group_pattern = p_uty(custom_check = check_string, tags = c("train", "required")),
+        treatment_encoding = p_lgl(tags = c("train", "required")),
+        treatment_cutoff = p_dbl(default = 0, tags = "train", depends = quote(treatment_encoding == TRUE)),
+        ref_name = p_uty(custom_check = crate(function(x) check_string(x, min.chars = 1)), tags = "train", depends = quote(treatment_encoding == TRUE)),
+        ties_method = p_fct(c("first", "last", "random"), tags = c("train", "required"))
+      )
+      ps$values = list(treatment_encoding = FALSE, group_pattern = "^([^.]+)\\.", ties_method = "random")
+      super$initialize(id, param_set = ps, param_vals = param_vals, tags = "encode", feature_types = c("integer", "numeric"))
+    }
+  ),
+  private = list(
+
+    .get_state_dt = function(dt, levels, target) {
+      pv = self$param_set$values
+      ref_name = pv$ref_name %??% "ref"
+      cols = colnames(dt)
+
+      # If pattern == "", all columns are collapsed into one column.
+      # Note, that column "pipeop.decoded" gets overwritten if it already exists.
+      if (pv$group_pattern == "") {
+        cmap = list(pipeop.decoded = set_names(cols, cols))
+
+        if (pv$treatment_encoding) {
+          # Append reference level with empty name (i.e. "")
+          cmap[["pipeop.decoded"]][[length(cols) + 1]] = get_ref_name(ref_name, cmap[["pipeop.decoded"]])
+        }
+
+        s = list(
+          colmaps = cmap,
+          treatment_encoding = pv$treatment_encoding,
+          cutoff = pv$treatment_cutoff %??% 0,
+          ties_method = pv$ties_method
+        )
+
+        return(s)
+      }
+
+      # Drop columns that do not match group_pattern
+      cols = cols[grepl(pv$group_pattern, cols, perl = TRUE)]
+
+      # Extract names for new levels
+      lvls = set_names(gsub(pv$group_pattern, "", cols, perl = TRUE), cols)
+
+      # Extract names for new factor columns to be populated with lvls
+      matches = regmatches(cols, regexec(pv$group_pattern, cols, perl = TRUE))
+      # Error, if nothing was captured.
+      if (any(lengths(matches) < 2)) {
+        stopf("Pattern %s matches column name %s, but nothing was captured. Make sure \"group_pattern\" contains a capturing group or is an empty string to collapse all colunns into one factor.",
+              str_collapse(pv$group_pattern, quote = '"'),
+              str_collapse(cols[lengths(matches) < 2], quote = '"'))
+      }
+
+      fcts = map_chr(matches, 2)
+      # Error, if no group could be extracted for an entry in col so that we could not create a column name from it.
+      if (any(nchar(fcts) == 0)) {
+        stopf("Pattern %s with column(s) %s would produce empty string as decoded column name(s). Try using a different pattern.",
+              str_collapse(pv$group_pattern, quote = '"'),
+              str_collapse(cols[nchar(fcts) == 0], quote = '"'))
+      }
+
+      # Create mapping of old column names and derived levels to new column names
+      cmap = split(lvls, fcts)
+
+      if (pv$treatment_encoding) {
+        # Append reference level with empty name (i.e. "") to all list entries
+        for (i in seq_along(cmap)) {
+          cmap[[i]][[length(cmap[[i]]) + 1]] = get_ref_name(ref_name, cmap[[i]])
+        }
+      }
+
+      list(
+        colmaps = cmap,
+        treatment_encoding = pv$treatment_encoding,
+        cutoff = pv$treatment_cutoff %??% 0,
+        ties_method = pv$ties_method
+      )
+    },
+
+    .transform_dt = function(dt, levels) {
+      colmaps = self$state$colmaps
+      if (!length(colmaps)) {
+        return(dt)  # Early exit if no mapping is required
+      }
+      cutoff = self$state$cutoff
+      ties_method = self$state$ties_method
+      treatment_encoding = self$state$treatment_encoding
+
+      dt_collapsed = data.table()
+      lapply(names(colmaps), function(new_col) {
+        lvls = colmaps[[new_col]]
+        # Get old column names and, ff existent, remove empty string element (for subsetting dt_collapse in next step)
+        old_cols = discard(names(lvls), names(lvls) == "")
+        # Create matrix from subset of dt with column names given by old_cols
+        old_cols_matrix = as.matrix(dt[, old_cols, with = FALSE])
+        # Populate new column with name of column with maximal value per row
+        set(dt_collapsed, , new_col, old_cols[apply(old_cols_matrix, 1, which_max, ties_method = ties_method)])
+        if (treatment_encoding) {
+          # If all values in old_cols_matrix are smaller than or equal to the cutoff, replace with empty string
+          # This leads to replacement with reference level in next step.
+          set(dt_collapsed, which(rowSums(old_cols_matrix > cutoff) == 0), new_col, "")
+        }
+        # Replace occurrences of old column names with corresponding new level names
+        set(dt_collapsed, , new_col, factor(lvls[match(dt_collapsed[[new_col]], names(lvls))], levels = lvls))
+      })
+
+      # Drop old columns (if existent, remove empty string elements, to allow subsetting)
+      drop = unlist(lapply(colmaps, names))
+      drop = discard(drop, drop == "")
+      dt[, (drop) := NULL]
+
+      # cbind new columns
+      do.call(cbind, list(dt, dt_collapsed))
+    }
+  )
+)
+
+mlr_pipeops$add("decode", PipeOpDecode)
+
+# Ensures the reference level name is unique for a given factor by appending an incrementing suffix if needed.
+# * ref_name: name of the reference level by default
+# * lvl_names: all other level names for a given factor
+get_ref_name = function(ref_name, lvl_names) {
+  new_ref_name = ref_name
+  counter = 1
+  while (new_ref_name %in% lvl_names) {
+    new_ref_name = paste0(ref_name, ".", counter)
+    counter = counter + 1
+  }
+  new_ref_name
+}
@@ -1,11 +1,11 @@
-#' @title Kernelized Principle Component Analysis
+#' @title Kernelized Principal Component Analysis
 #'
 #' @usage NULL
 #' @name mlr_pipeops_kernelpca
 #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreproc`]/[`PipeOp`].
 #'
 #' @description
-#' Extracts kernel principle components from data. Only affects numerical features.
+#' Extracts kernel principal components from data. Only affects numerical features.
 #' See [kernlab::kpca] for details.
 #'
 #' @section Construction:
 
@@ -83,7 +83,7 @@ po.PipeOp = function(.obj, ...) {
 
 #' @export
 po.character = function(.obj, ...) {
-  dictionary_sugar_inc_get(dict = mlr_pipeops, .key = .obj, ...)
+  dictionary_sugar_inc_get(dict = mlr_pipeops, .key = .obj, ..., .dicts_suggest = list("ppl()" = mlr_graphs))
 }
 
 #' @export
@@ -111,7 +111,7 @@ pos.NULL = function(.objs, ...) {
 
 #' @export
 pos.character = function(.objs, ...) {
-  dictionary_sugar_inc_mget(dict = mlr_pipeops, .keys = .objs, ...)
+  dictionary_sugar_inc_mget(dict = mlr_pipeops, .keys = .objs, ..., .dicts_suggest = list("ppls()" = mlr_graphs))
 }
 
 #' @export
 
@@ -23,12 +23,12 @@
 #' gr = ppl("bagging", graph = po(lrn("regr.rpart")),
 #'   averager = po("regravg", collect_multiplicity = TRUE))
 ppl = function(.key, ...) {
-  dictionary_sugar_get(dict = mlr_graphs, .key = .key, ...)
+  dictionary_sugar_get(dict = mlr_graphs, .key = .key, ..., .dicts_suggest = list("po()" = mlr_pipeops))
 }
 
 #' @export
 #' @rdname ppl
 ppls = function(.keys, ...) {
   if (missing(.keys)) return(mlr_graphs)
-  map(.x = .keys, .f = dictionary_sugar_get, dict = mlr_graphs, ...)
+  map(.x = .keys, .f = dictionary_sugar_get, dict = mlr_graphs, ..., .dicts_suggest = list("pos()" = mlr_pipeops))
 }
Original file line number	Diff line number	Diff line change
`@@ -83,7 +83,7 @@ po.PipeOp = function(.obj, ...) {`
`83`	`83`
`84`	`84`	`#' @export`
`85`	`85`	`po.character = function(.obj, ...) {`
`86`		`- dictionary_sugar_inc_get(dict = mlr_pipeops, .key = .obj, ...)`
	`86`	`+ dictionary_sugar_inc_get(dict = mlr_pipeops, .key = .obj, ..., .dicts_suggest = list("ppl()" = mlr_graphs))`
`87`	`87`	`}`
`88`	`88`
`89`	`89`	`#' @export`
`@@ -111,7 +111,7 @@ pos.NULL = function(.objs, ...) {`
`111`	`111`
`112`	`112`	`#' @export`
`113`	`113`	`pos.character = function(.objs, ...) {`
`114`		`- dictionary_sugar_inc_mget(dict = mlr_pipeops, .keys = .objs, ...)`
	`114`	`+ dictionary_sugar_inc_mget(dict = mlr_pipeops, .keys = .objs, ..., .dicts_suggest = list("ppls()" = mlr_graphs))`
`115`	`115`	`}`
`116`	`116`
`117`	`117`	`#' @export`
Original file line number	Diff line number	Diff line change
`@@ -23,12 +23,12 @@`
`23`	`23`	`#' gr = ppl("bagging", graph = po(lrn("regr.rpart")),`
`24`	`24`	`#' averager = po("regravg", collect_multiplicity = TRUE))`
`25`	`25`	`ppl = function(.key, ...) {`
`26`		`- dictionary_sugar_get(dict = mlr_graphs, .key = .key, ...)`
	`26`	`+ dictionary_sugar_get(dict = mlr_graphs, .key = .key, ..., .dicts_suggest = list("po()" = mlr_pipeops))`
`27`	`27`	`}`
`28`	`28`
`29`	`29`	`#' @export`
`30`	`30`	`#' @rdname ppl`
`31`	`31`	`ppls = function(.keys, ...) {`
`32`	`32`	`if (missing(.keys)) return(mlr_graphs)`
`33`		`- map(.x = .keys, .f = dictionary_sugar_get, dict = mlr_graphs, ...)`
	`33`	`+ map(.x = .keys, .f = dictionary_sugar_get, dict = mlr_graphs, ..., .dicts_suggest = list("pos()" = mlr_pipeops))`
`34`	`34`	`}`