From 8223497bade0899cb1e6603140cadf327d1380c1 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 14 Mar 2024 19:18:55 +0100 Subject: [PATCH 01/43] feat: ensemble feature selection --- DESCRIPTION | 1 + NAMESPACE | 1 + R/ensemble_fselect.R | 67 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 R/ensemble_fselect.R diff --git a/DESCRIPTION b/DESCRIPTION index 5af055c1..5b638898 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -68,6 +68,7 @@ Collate: 'assertions.R' 'auto_fselector.R' 'bibentries.R' + 'ensemble_fselect.R' 'extract_inner_fselect_archives.R' 'extract_inner_fselect_results.R' 'fselect.R' diff --git a/NAMESPACE b/NAMESPACE index 6e326f11..05027c4d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,7 @@ export(auto_fselector) export(callback_fselect) export(clbk) export(clbks) +export(ensemble_fselect) export(extract_inner_fselect_archives) export(extract_inner_fselect_results) export(fs) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R new file mode 100644 index 00000000..42ff17cb --- /dev/null +++ b/R/ensemble_fselect.R @@ -0,0 +1,67 @@ +#' @export +ensemble_fselect = function(task, learners, outer_resampling, inner_resampling, fselector, terminator) { + assert_task(task) + assert_learners(learners) + assert_resampling(outer_resampling) + assert_resampling(inner_resampling) + assert_fselector(fselector) + assert_terminator(terminator) + + outer_resampling$instantiate(task) + + grid = map_dtr(seq(outer_resampling$iters), function(i) { + + afs = auto_fselector( + fselector = fselector, + learner = learners[[i]], + resampling = inner_resampling, + measure = measure, + terminator = terminator, + store_models = TRUE + ) + + task_subset = task$clone()$filter(outer_resampling$train_set(i)) + resampling = rsmp("insample")$instantiate(task_subset) + + data.table( + iter = i, + base_learner_id = learners[[i]]$id, + base_learner = list(learners[[i]]), + learner = list(afs), + task = list(task_subset), + resampling 
= list(resampling) + ) + }) + + design = grid[, list(learner, task, resampling)] + + bmr = benchmark(design, store_models = TRUE) + + # extract + afss = bmr$score()$learner + features = map(afss, function(afs) { + afs$fselect_result$features[[1]] + }) + + n_features = map_int(afss, function(afs) { + afs$fselect_result$n_features[[1]] + }) + + set(grid, j = "features", value = features) + set(grid, j = "n_features", value = n_features) + + grid +} + +if (FALSE) { + task = tsk("sonar") + learners = lrns(c("classif.rpart", "classif.rpart")) + outer_resampling = rsmp("subsampling", repeats = 2) + inner_resampling = rsmp("cv", folds = 3) + measure = msr("classif.ce") + fselector = fs("random_search") + terminator = trm("evals", n_evals = 10) + + ensemble_fselect(task, learners, outer_resampling, inner_resampling, fselector, terminator) + +} From e06f8c9247c5e0d8862ce32c0ac03e52e6883144 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 4 Apr 2024 19:02:27 +0200 Subject: [PATCH 02/43] docs: improve documentation --- R/ensemble_fselect.R | 56 ++++++++++++++++++-------- man/ensemble_fselect.Rd | 56 ++++++++++++++++++++++++++ tests/testthat/test_ensemble_fselect.R | 15 +++++++ 3 files changed, 111 insertions(+), 16 deletions(-) create mode 100644 man/ensemble_fselect.Rd create mode 100644 tests/testthat/test_ensemble_fselect.R diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 42ff17cb..85ee2c01 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -1,12 +1,49 @@ +#' @title Ensemble Feature Selection +#' +#' @description +#' Ensemble feature selection using multiple learners. +#' +#' @param learners (list of [mlr3::Learner])\cr +#' The learners to be used for feature selection. +#' @param outer_resampling ([mlr3::Resampling])\cr +#' The outer resampling strategy. +#' The number of iterations must match the number of learners. +#' @param inner_resampling ([mlr3::Resampling])\cr +#' The inner resampling strategy used by the [FSelector]. 
+#' +#' @template param_fselector +#' @template param_task +#' @template param_measure +#' @template param_terminator +#' #' @export -ensemble_fselect = function(task, learners, outer_resampling, inner_resampling, fselector, terminator) { +#' @examples +#' \donttest{ +#' +#' ensemble_fselect( +#' fselector = fs("random_search"), +#' task = tsk("sonar"), +#' learners = lrns(c("classif.rpart", "classif.featureless")), +#' outer_resampling = rsmp("subsampling", repeats = 2), +#' inner_resampling = rsmp("cv", folds = 3), +#' measure = msr("classif.ce"), +#' terminator = trm("evals", n_evals = 10) +#' ) +#' } +ensemble_fselect = function(fselector, task, learners, outer_resampling, inner_resampling, measure, terminator) { assert_task(task) - assert_learners(learners) + assert_learners(as_learners(learners), task = task) assert_resampling(outer_resampling) assert_resampling(inner_resampling) + assert_measure(measure) assert_fselector(fselector) assert_terminator(terminator) + if (length(learners) != outer_resampling$iters) { + stopf("Number of learners %i must match number of outer resampling iterations %i.", + length(learners), outer_resampling$iters) + } + outer_resampling$instantiate(task) grid = map_dtr(seq(outer_resampling$iters), function(i) { @@ -33,7 +70,7 @@ ensemble_fselect = function(task, learners, outer_resampling, inner_resampling, ) }) - design = grid[, list(learner, task, resampling)] + design = grid[, c("learner", "task", "resampling"), with = FALSE] bmr = benchmark(design, store_models = TRUE) @@ -52,16 +89,3 @@ ensemble_fselect = function(task, learners, outer_resampling, inner_resampling, grid } - -if (FALSE) { - task = tsk("sonar") - learners = lrns(c("classif.rpart", "classif.rpart")) - outer_resampling = rsmp("subsampling", repeats = 2) - inner_resampling = rsmp("cv", folds = 3) - measure = msr("classif.ce") - fselector = fs("random_search") - terminator = trm("evals", n_evals = 10) - - ensemble_fselect(task, learners, outer_resampling, 
inner_resampling, fselector, terminator) - -} diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd new file mode 100644 index 00000000..d5439996 --- /dev/null +++ b/man/ensemble_fselect.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ensemble_fselect.R +\name{ensemble_fselect} +\alias{ensemble_fselect} +\title{Ensemble Feature Selection} +\usage{ +ensemble_fselect( + fselector, + task, + learners, + outer_resampling, + inner_resampling, + measure, + terminator +) +} +\arguments{ +\item{fselector}{(\link{FSelector})\cr +Optimization algorithm.} + +\item{task}{(\link[mlr3:Task]{mlr3::Task})\cr +Task to operate on.} + +\item{learners}{(list of \link[mlr3:Learner]{mlr3::Learner})\cr +The learners to be used for feature selection.} + +\item{outer_resampling}{(\link[mlr3:Resampling]{mlr3::Resampling})\cr +The outer resampling strategy. +The number of iterations must match the number of learners.} + +\item{inner_resampling}{(\link[mlr3:Resampling]{mlr3::Resampling})\cr +The inner resampling strategy used by the \link{FSelector}.} + +\item{measure}{(\link[mlr3:Measure]{mlr3::Measure})\cr +Measure to optimize. If \code{NULL}, default measure is used.} + +\item{terminator}{(\link{Terminator})\cr +Stop criterion of the feature selection.} +} +\description{ +Ensemble feature selection using multiple learners. 
+} +\examples{ +\donttest{ + + ensemble_fselect( + fselector = fs("random_search"), + task = tsk("sonar"), + learners = lrns(c("classif.rpart", "classif.featureless")), + outer_resampling = rsmp("subsampling", repeats = 2), + inner_resampling = rsmp("cv", folds = 3), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 10) + ) +} +} diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R new file mode 100644 index 00000000..65a53f18 --- /dev/null +++ b/tests/testthat/test_ensemble_fselect.R @@ -0,0 +1,15 @@ +test_that("esemble feature selection works", { + res = ensemble_fselect( + fselector = fs("random_search"), + task = tsk("sonar"), + learners = lrns(c("classif.rpart", "classif.featureless")), + outer_resampling = rsmp("subsampling", repeats = 2), + inner_resampling = rsmp("cv", folds = 3), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 10) + ) + + expect_data_table(res, nrows = 2) +}) + + From 2b724aeb90d145bc2021a76d535e86fbeb38f357 Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 10 Apr 2024 15:30:53 +0200 Subject: [PATCH 03/43] fix: outer iterations times learners --- R/ensemble_fselect.R | 26 ++++++++++++-------------- tests/testthat/test_ensemble_fselect.R | 2 +- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 85ee2c01..e3597b77 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -39,32 +39,30 @@ ensemble_fselect = function(fselector, task, learners, outer_resampling, inner_r assert_fselector(fselector) assert_terminator(terminator) - if (length(learners) != outer_resampling$iters) { - stopf("Number of learners %i must match number of outer resampling iterations %i.", - length(learners), outer_resampling$iters) - } - - outer_resampling$instantiate(task) - - grid = map_dtr(seq(outer_resampling$iters), function(i) { - - afs = auto_fselector( + # create fselector for each learner + afss = map(learners, 
function(learner) { + auto_fselector( fselector = fselector, - learner = learners[[i]], + learner = learner, resampling = inner_resampling, measure = measure, terminator = terminator, store_models = TRUE ) + }) + + outer_resampling$instantiate(task) + grid = map_dtr(seq(outer_resampling$iters), function(i) { + # create task and resampling for each outer iteration task_subset = task$clone()$filter(outer_resampling$train_set(i)) resampling = rsmp("insample")$instantiate(task_subset) data.table( iter = i, - base_learner_id = learners[[i]]$id, - base_learner = list(learners[[i]]), - learner = list(afs), + base_learner_id = map(learners, "id"), + base_learner = learners, + learner = afss, task = list(task_subset), resampling = list(resampling) ) diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index 65a53f18..00a28f55 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -9,7 +9,7 @@ test_that("esemble feature selection works", { terminator = trm("evals", n_evals = 10) ) - expect_data_table(res, nrows = 2) + expect_data_table(res, nrows = 4) }) From 2e05bd3d85a0b730932ee5081445b510d38e826c Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 10 Apr 2024 15:46:11 +0200 Subject: [PATCH 04/43] feat: allow callbacks --- R/ensemble_fselect.R | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index e3597b77..644ea0b4 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -30,14 +30,10 @@ #' terminator = trm("evals", n_evals = 10) #' ) #' } -ensemble_fselect = function(fselector, task, learners, outer_resampling, inner_resampling, measure, terminator) { +ensemble_fselect = function(fselector, task, learners, outer_resampling, inner_resampling, measure, terminator, callbacks = list()) { assert_task(task) assert_learners(as_learners(learners), task = task) assert_resampling(outer_resampling) - 
assert_resampling(inner_resampling) - assert_measure(measure) - assert_fselector(fselector) - assert_terminator(terminator) # create fselector for each learner afss = map(learners, function(learner) { @@ -47,7 +43,8 @@ ensemble_fselect = function(fselector, task, learners, outer_resampling, inner_r resampling = inner_resampling, measure = measure, terminator = terminator, - store_models = TRUE + store_models = TRUE, + callbacks = callbacks ) }) From 12a78fdab07ca9144141796d688094ed57ae5b64 Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 10 Apr 2024 15:53:01 +0200 Subject: [PATCH 05/43] docs: callback --- R/ensemble_fselect.R | 1 + man/ensemble_fselect.Rd | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 644ea0b4..c75aba29 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -15,6 +15,7 @@ #' @template param_task #' @template param_measure #' @template param_terminator +#' @template param_callbacks #' #' @export #' @examples diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index d5439996..f5679063 100644 --- a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -11,7 +11,8 @@ ensemble_fselect( outer_resampling, inner_resampling, measure, - terminator + terminator, + callbacks = list() ) } \arguments{ @@ -36,6 +37,9 @@ Measure to optimize. If \code{NULL}, default measure is used.} \item{terminator}{(\link{Terminator})\cr Stop criterion of the feature selection.} + +\item{callbacks}{(list of \link{CallbackFSelect})\cr +List of callbacks.} } \description{ Ensemble feature selection using multiple learners. 
From b8e28308b53b7fe8765b4899b7c96ee889d9f057 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 18 Apr 2024 14:23:17 +0200 Subject: [PATCH 06/43] feat: add store_models option --- R/ensemble_fselect.R | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index c75aba29..9e37245d 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -31,7 +31,17 @@ #' terminator = trm("evals", n_evals = 10) #' ) #' } -ensemble_fselect = function(fselector, task, learners, outer_resampling, inner_resampling, measure, terminator, callbacks = list()) { +ensemble_fselect = function( + fselector, + task, + learners, + outer_resampling, + inner_resampling, + measure, + terminator, + callbacks = list(), + store_models = TRUE + ) { assert_task(task) assert_learners(as_learners(learners), task = task) assert_resampling(outer_resampling) @@ -44,7 +54,7 @@ ensemble_fselect = function(fselector, task, learners, outer_resampling, inner_r resampling = inner_resampling, measure = measure, terminator = terminator, - store_models = TRUE, + store_models = store_models, callbacks = callbacks ) }) From 6408dd3cd3dba276ebbe3927509b6984572f4e39 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 18 Apr 2024 14:50:12 +0200 Subject: [PATCH 07/43] feat: add scores --- R/ensemble_fselect.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 9e37245d..7d7424f6 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -80,18 +80,27 @@ ensemble_fselect = function( bmr = benchmark(design, store_models = TRUE) - # extract + afss = bmr$score()$learner + + # extract features features = map(afss, function(afs) { afs$fselect_result$features[[1]] }) + # extract n_features n_features = map_int(afss, function(afs) { afs$fselect_result$n_features[[1]] }) + # extract scores + scores = map_dbl(afss, function(afs) { + afs$fselect_instance$archive$best()[, 
measure$id, with = FALSE][[1]] + }) + set(grid, j = "features", value = features) set(grid, j = "n_features", value = n_features) + set(grid, j = measure$id, value = scores) grid } From f545bac116f004ff317f09e0c9654dd8073aa4b7 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 16 May 2024 15:41:00 +0200 Subject: [PATCH 08/43] refactor input arg + add doc --- R/ensemble_fselect.R | 38 ++++++++++++++++++++++++++++---------- man/ensemble_fselect.Rd | 29 +++++++++++++++++++++++------ 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 7d7424f6..d15b4349 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -2,12 +2,29 @@ #' #' @description #' Ensemble feature selection using multiple learners. +#' The ensemble feature selection method is designed to identify the +#' most informative features from a given dataset by leveraging multiple +#' machine learning models and resampling techniques. +#' +#' @details +#' The method begins by applying an initial resampling technique specified +#' by the user, to create **multiple subsamples** from the original dataset. +#' This resampling process helps in generating diverse subsets of data for +#' robust feature selection. +#' +#' For each subsample generated in the previous step, the method performs +#' **wrapped-based feature selection** using each provided learner, an inner +#' resampling method and a performance measure. +#' This process generates a best feature subset for each combination of +#' subsample and learner. +#' Results are stored in a [data.table] object. #' #' @param learners (list of [mlr3::Learner])\cr #' The learners to be used for feature selection. -#' @param outer_resampling ([mlr3::Resampling])\cr -#' The outer resampling strategy. -#' The number of iterations must match the number of learners. 
+#' @param init_resampling ([mlr3::Resampling])\cr +#' The initial resampling strategy of the data, from which each train set +#' will be passed on to the learners. +#' Can only be [mlr_resamplings_subsampling] or [mlr_resamplings_bootstrap]. #' @param inner_resampling ([mlr3::Resampling])\cr #' The inner resampling strategy used by the [FSelector]. #' @@ -25,7 +42,7 @@ #' fselector = fs("random_search"), #' task = tsk("sonar"), #' learners = lrns(c("classif.rpart", "classif.featureless")), -#' outer_resampling = rsmp("subsampling", repeats = 2), +#' init_resampling = rsmp("subsampling", repeats = 2), #' inner_resampling = rsmp("cv", folds = 3), #' measure = msr("classif.ce"), #' terminator = trm("evals", n_evals = 10) @@ -35,7 +52,7 @@ ensemble_fselect = function( fselector, task, learners, - outer_resampling, + init_resampling, inner_resampling, measure, terminator, @@ -44,7 +61,9 @@ ensemble_fselect = function( ) { assert_task(task) assert_learners(as_learners(learners), task = task) - assert_resampling(outer_resampling) + assert_resampling(init_resampling) + assert_choice(class(init_resampling)[1], + choices = c("ResamplingBootstrap", "ResamplingSubsampling")) # create fselector for each learner afss = map(learners, function(learner) { @@ -59,11 +78,11 @@ ensemble_fselect = function( ) }) - outer_resampling$instantiate(task) - grid = map_dtr(seq(outer_resampling$iters), function(i) { + init_resampling$instantiate(task) + grid = map_dtr(seq(init_resampling$iters), function(i) { # create task and resampling for each outer iteration - task_subset = task$clone()$filter(outer_resampling$train_set(i)) + task_subset = task$clone()$filter(init_resampling$train_set(i)) resampling = rsmp("insample")$instantiate(task_subset) data.table( @@ -80,7 +99,6 @@ ensemble_fselect = function( bmr = benchmark(design, store_models = TRUE) - afss = bmr$score()$learner # extract features diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index f5679063..bdcf9e39 100644 --- 
a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -8,11 +8,12 @@ ensemble_fselect( fselector, task, learners, - outer_resampling, + init_resampling, inner_resampling, measure, terminator, - callbacks = list() + callbacks = list(), + store_models = TRUE ) } \arguments{ @@ -25,9 +26,10 @@ Task to operate on.} \item{learners}{(list of \link[mlr3:Learner]{mlr3::Learner})\cr The learners to be used for feature selection.} -\item{outer_resampling}{(\link[mlr3:Resampling]{mlr3::Resampling})\cr -The outer resampling strategy. -The number of iterations must match the number of learners.} +\item{init_resampling}{(\link[mlr3:Resampling]{mlr3::Resampling})\cr +The initial resampling strategy of the data, from which each train set +will be passed on to the learners. +Can only be \link{mlr_resamplings_subsampling} or \link{mlr_resamplings_bootstrap}.} \item{inner_resampling}{(\link[mlr3:Resampling]{mlr3::Resampling})\cr The inner resampling strategy used by the \link{FSelector}.} @@ -43,6 +45,21 @@ List of callbacks.} } \description{ Ensemble feature selection using multiple learners. +The ensemble feature selection method is designed to identify the +most informative features from a given dataset by leveraging multiple +machine learning models and resampling techniques. +} +\details{ +The method begins by applying an initial resampling technique specified +by the user, to create multiple subsamples from the original dataset. +This resampling process helps in generating diverse subsets of data for +robust feature selection. +For each subsample generated in the previous step, the method performs +wrapped-based feature selection using each provided learner, an inner +resampling method and a performance measure. +This process generates a best feature subset for each combination of +subsample and learner. +All the results are stored in a \link{data.table} object. } \examples{ \donttest{ @@ -51,7 +68,7 @@ Ensemble feature selection using multiple learners. 
fselector = fs("random_search"), task = tsk("sonar"), learners = lrns(c("classif.rpart", "classif.featureless")), - outer_resampling = rsmp("subsampling", repeats = 2), + init_resampling = rsmp("subsampling", repeats = 2), inner_resampling = rsmp("cv", folds = 3), measure = msr("classif.ce"), terminator = trm("evals", n_evals = 10) From 13f52ff315950f729599c9203cd9c418c1339dde Mon Sep 17 00:00:00 2001 From: john Date: Thu, 16 May 2024 16:56:23 +0200 Subject: [PATCH 09/43] better doc --- R/ensemble_fselect.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index d15b4349..a113eac5 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -13,8 +13,9 @@ #' robust feature selection. #' #' For each subsample generated in the previous step, the method performs -#' **wrapped-based feature selection** using each provided learner, an inner -#' resampling method and a performance measure. +#' **wrapped-based feature selection** ([auto_fselector]) using each provided +#' learner, the given inner resampling method, performance measure and +#' optimization algorithm. #' This process generates a best feature subset for each combination of #' subsample and learner. #' Results are stored in a [data.table] object. @@ -27,6 +28,8 @@ #' Can only be [mlr_resamplings_subsampling] or [mlr_resamplings_bootstrap]. #' @param inner_resampling ([mlr3::Resampling])\cr #' The inner resampling strategy used by the [FSelector]. +#' @param store_model (`logical(1)`)\cr +#' Whether to store models in [auto_fselector] or not. 
#' #' @template param_fselector #' @template param_task @@ -37,7 +40,6 @@ #' @export #' @examples #' \donttest{ -#' #' ensemble_fselect( #' fselector = fs("random_search"), #' task = tsk("sonar"), @@ -45,7 +47,7 @@ #' init_resampling = rsmp("subsampling", repeats = 2), #' inner_resampling = rsmp("cv", folds = 3), #' measure = msr("classif.ce"), -#' terminator = trm("evals", n_evals = 10) +#' terminator = trm("evals", n_evals = 5) #' ) #' } ensemble_fselect = function( From 9f0edf3623510f19b27a8493f6a23a8b82061345 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 16 May 2024 16:57:36 +0200 Subject: [PATCH 10/43] remove base_learner, correct iter --- R/ensemble_fselect.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index a113eac5..dbceddf3 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -89,8 +89,7 @@ ensemble_fselect = function( data.table( iter = i, - base_learner_id = map(learners, "id"), - base_learner = learners, + learner_id = map(learners, "id"), learner = afss, task = list(task_subset), resampling = list(resampling) @@ -118,6 +117,7 @@ ensemble_fselect = function( afs$fselect_instance$archive$best()[, measure$id, with = FALSE][[1]] }) + set(grid, j = "iter", value = 1:bmr$n_resample_results) set(grid, j = "features", value = features) set(grid, j = "n_features", value = n_features) set(grid, j = measure$id, value = scores) From 7bbefd0d42a7218c275d09235aaa3c9faf7c90c0 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 16 May 2024 17:11:55 +0200 Subject: [PATCH 11/43] revert back example --- R/ensemble_fselect.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index dbceddf3..f568d112 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -47,7 +47,7 @@ #' init_resampling = rsmp("subsampling", repeats = 2), #' inner_resampling = rsmp("cv", folds = 3), #' measure = msr("classif.ce"), -#' terminator = trm("evals", 
n_evals = 5) +#' terminator = trm("evals", n_evals = 10) #' ) #' } ensemble_fselect = function( From dcf6728a2df7f4cd18f836c138f7a549094c8995 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 16 May 2024 19:03:01 +0200 Subject: [PATCH 12/43] get importance scores from RFE --- R/ensemble_fselect.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index f568d112..422a4b77 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -122,5 +122,13 @@ ensemble_fselect = function( set(grid, j = "n_features", value = n_features) set(grid, j = measure$id, value = scores) + # extract importance scores if RFE optimization was used + if (class(fselector)[1] == "FSelectorRFE") { + imp_scores = map(afss, function(afs) { + afs$fselect_result$importance[[1]] + }) + set(grid, j = "importance", value = imp_scores) + } + grid } From 82e9f7fb533b8747cceaaa33f0c8796614d5f775 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 16 May 2024 19:04:17 +0200 Subject: [PATCH 13/43] update docs --- man/ensemble_fselect.Rd | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index bdcf9e39..7e609c28 100644 --- a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -42,6 +42,9 @@ Stop criterion of the feature selection.} \item{callbacks}{(list of \link{CallbackFSelect})\cr List of callbacks.} + +\item{store_model}{(\code{logical(1)})\cr +Whether to store models in \link{auto_fselector} or not.} } \description{ Ensemble feature selection using multiple learners. @@ -51,19 +54,20 @@ machine learning models and resampling techniques. } \details{ The method begins by applying an initial resampling technique specified -by the user, to create multiple subsamples from the original dataset. +by the user, to create \strong{multiple subsamples} from the original dataset. This resampling process helps in generating diverse subsets of data for robust feature selection. 
+ For each subsample generated in the previous step, the method performs -wrapped-based feature selection using each provided learner, an inner -resampling method and a performance measure. +\strong{wrapped-based feature selection} (\link{auto_fselector}) using each provided +learner, the given inner resampling method, performance measure and +optimization algorithm. This process generates a best feature subset for each combination of subsample and learner. -All the results are stored in a \link{data.table} object. +Results are stored in a \link{data.table} object. } \examples{ \donttest{ - ensemble_fselect( fselector = fs("random_search"), task = tsk("sonar"), From a563ec81425ac4b53912d87f492da5919ad3bff6 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 16 May 2024 19:11:31 +0200 Subject: [PATCH 14/43] update test --- tests/testthat/test_ensemble_fselect.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index 00a28f55..bebac790 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -1,15 +1,19 @@ -test_that("esemble feature selection works", { +test_that("ensemble feature selection works", { res = ensemble_fselect( - fselector = fs("random_search"), + fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), task = tsk("sonar"), learners = lrns(c("classif.rpart", "classif.featureless")), - outer_resampling = rsmp("subsampling", repeats = 2), + init_resampling = rsmp("subsampling", repeats = 2), inner_resampling = rsmp("cv", folds = 3), measure = msr("classif.ce"), - terminator = trm("evals", n_evals = 10) + terminator = trm("none") ) expect_data_table(res, nrows = 4) + expect_list(res$features, any.missing = FALSE, len = 4) + expect_vector(res$n_features, size = 4) + expect_vector(res$classif.ce, size = 4) + expect_list(res$importance, any.missing = FALSE, len = 4) }) From 
b3f1678ebb9c0b4e491698bc50a13bf31551f9ef Mon Sep 17 00:00:00 2001 From: john Date: Thu, 16 May 2024 22:38:33 +0200 Subject: [PATCH 15/43] fix typo --- R/ensemble_fselect.R | 2 +- man/ensemble_fselect.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 422a4b77..2edb698b 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -28,7 +28,7 @@ #' Can only be [mlr_resamplings_subsampling] or [mlr_resamplings_bootstrap]. #' @param inner_resampling ([mlr3::Resampling])\cr #' The inner resampling strategy used by the [FSelector]. -#' @param store_model (`logical(1)`)\cr +#' @param store_models (`logical(1)`)\cr #' Whether to store models in [auto_fselector] or not. #' #' @template param_fselector diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index 7e609c28..0393ff6d 100644 --- a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -43,7 +43,7 @@ Stop criterion of the feature selection.} \item{callbacks}{(list of \link{CallbackFSelect})\cr List of callbacks.} -\item{store_model}{(\code{logical(1)})\cr +\item{store_models}{(\code{logical(1)})\cr Whether to store models in \link{auto_fselector} or not.} } \description{ From f16a62106b54075ec86fe2645a050d6727df15bf Mon Sep 17 00:00:00 2001 From: john Date: Thu, 16 May 2024 23:03:06 +0200 Subject: [PATCH 16/43] fix warning 'Missing link' --- R/ensemble_fselect.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 2edb698b..0f58ccf2 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -1,5 +1,7 @@ #' @title Ensemble Feature Selection #' +#' @include CallbackFSelect.R +#' #' @description #' Ensemble feature selection using multiple learners. 
#' The ensemble feature selection method is designed to identify the @@ -67,7 +69,7 @@ ensemble_fselect = function( assert_choice(class(init_resampling)[1], choices = c("ResamplingBootstrap", "ResamplingSubsampling")) - # create fselector for each learner + # create auto_fselector for each learner afss = map(learners, function(learner) { auto_fselector( fselector = fselector, From be13f33c69acf08f1c5f2fb88a492c2e71ea5745 Mon Sep 17 00:00:00 2001 From: john Date: Tue, 21 May 2024 11:57:40 +0200 Subject: [PATCH 17/43] fixes after main merge --- R/ensemble_fselect.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 0f58ccf2..021956af 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -1,6 +1,6 @@ #' @title Ensemble Feature Selection #' -#' @include CallbackFSelect.R +#' @include CallbackBatchFSelect.R #' #' @description #' Ensemble feature selection using multiple learners. @@ -125,7 +125,7 @@ ensemble_fselect = function( set(grid, j = measure$id, value = scores) # extract importance scores if RFE optimization was used - if (class(fselector)[1] == "FSelectorRFE") { + if (class(fselector)[1] == "FSelectorBatchRFE") { imp_scores = map(afss, function(afs) { afs$fselect_result$importance[[1]] }) From 3f4b684d3941a57d08cdc448c1ed555f8c0fcb3b Mon Sep 17 00:00:00 2001 From: john Date: Tue, 21 May 2024 11:58:56 +0200 Subject: [PATCH 18/43] updocs --- man/AutoFSelector.Rd | 1 - man/ensemble_fselect.Rd | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/man/AutoFSelector.Rd b/man/AutoFSelector.Rd index bc2c8bb8..383a14ee 100644 --- a/man/AutoFSelector.Rd +++ b/man/AutoFSelector.Rd @@ -149,7 +149,6 @@ Hash (unique identifier) for this partial object, excluding some components whic \if{html}{\out{
Inherited methods
    -
  • mlr3::Learner$estimate_memory_usage()
  • mlr3::Learner$format()
  • mlr3::Learner$help()
  • mlr3::Learner$predict()
  • diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index 0393ff6d..24878d94 100644 --- a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -40,7 +40,7 @@ Measure to optimize. If \code{NULL}, default measure is used.} \item{terminator}{(\link{Terminator})\cr Stop criterion of the feature selection.} -\item{callbacks}{(list of \link{CallbackFSelect})\cr +\item{callbacks}{(list of \link{CallbackBatchFSelect})\cr List of callbacks.} \item{store_models}{(\code{logical(1)})\cr From 150d9a3d0823cf858a1778abdbc8ef6b966ffd32 Mon Sep 17 00:00:00 2001 From: john Date: Tue, 21 May 2024 17:13:54 +0200 Subject: [PATCH 19/43] fix bug in one-se callback and refactor --- R/mlr_callbacks.R | 19 ++++++++++++++----- man/mlr3fselect.one_se_rule.Rd | 4 +++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/R/mlr_callbacks.R b/R/mlr_callbacks.R index 99bad922..ea2d26cf 100644 --- a/R/mlr_callbacks.R +++ b/R/mlr_callbacks.R @@ -116,7 +116,9 @@ load_callback_svm_rfe = function() { #' #' @description #' Selects the smallest feature set within one standard error of the best as the result. -#' If there are multiple feature sets with the same performance and number of features, the first one is selected. +#' If there are multiple such feature sets with the same number of features, the first one is selected. +#' If the sets have exactly the same performance but different number of features, +#' the one with the smallest number of features is selected. 
#' #' @source #' `r format_bib("kuhn2013")` @@ -152,10 +154,17 @@ load_callback_one_se_rule = function() { y = data[[archive$cols_y]] se = sd(y) / sqrt(length(y)) - # select smallest future set within one standard error of the best - best_y = context$instance$result_y - data = data[y > best_y - se & y < best_y + se, ][which.min(n_features)] - context$instance$.__enclos_env__$private$.result = data[, setdiff(names(context$instance$result), "x_domain"), with = FALSE] + columns_to_keep = setdiff(names(context$instance$result), "x_domain") + if (se == 0) { + # select smallest future set when all scores are the same + context$instance$.__enclos_env__$private$.result = + data[,columns_to_keep, with = FALSE][which.min(n_features)] + } else { + # select smallest future set within one standard error of the best + best_y = context$instance$result_y + context$instance$.__enclos_env__$private$.result = + data[y > best_y - se & y < best_y + se, columns_to_keep, with = FALSE][which.min(n_features)] + } } ) } diff --git a/man/mlr3fselect.one_se_rule.Rd b/man/mlr3fselect.one_se_rule.Rd index 90faa355..ccea397b 100644 --- a/man/mlr3fselect.one_se_rule.Rd +++ b/man/mlr3fselect.one_se_rule.Rd @@ -12,7 +12,9 @@ ISBN 978-1-4614-6849-3. } \description{ Selects the smallest feature set within one standard error of the best as the result. -If there are multiple feature sets with the same performance and number of features, the first one is selected. +If there are multiple such feature sets with the same number of features, the first one is selected. +If the sets have exactly the same performance but different number of features, +the one with the smallest number of features is selected. 
} \examples{ clbk("mlr3fselect.one_se_rule") From f0b1098c58f012235d5e75a0eec0099bfe9de7f4 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 22 May 2024 14:09:10 +0200 Subject: [PATCH 20/43] add citations --- R/bibentries.R | 40 +++++++++++++++++++++++++++++++++++++++- R/ensemble_fselect.R | 2 ++ man/ensemble_fselect.Rd | 16 ++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/R/bibentries.R b/R/bibentries.R index bf93a7e6..a7ecb1df 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -66,6 +66,44 @@ bibentries = c( address = "New York, NY", pages = "61--92", isbn = "978-1-4614-6849-3" + ), + + saeys2008 = bibentry("article", + author = "Saeys, Yvan and Abeel, Thomas and Van De Peer, Yves", + doi = "10.1007/978-3-540-87481-2_21", + isbn = "3540874801", + journal = "Machine Learning and Knowledge Discovery in Databases", + pages = "313--325", + publisher = "Springer, Berlin, Heidelberg", + title = "Robust feature selection using ensemble feature selection techniques", + volume = "5212 LNAI", + year = "2008" + ), + + abeel2010 = bibentry("article", + author = "Abeel, Thomas and Helleputte, Thibault and Van de Peer, Yves and Dupont, Pierre and Saeys, Yvan", + doi = "10.1093/BIOINFORMATICS/BTP630", + issn = "1367-4803", + journal = "Bioinformatics", + month = "feb", + pages = "392--398", + publisher = "Oxford Academic", + title = "Robust biomarker identification for cancer diagnosis with ensemble feature selection methods", + volume = "26", + year = "2010" + ), + + pes2020 = bibentry("article", + author = "Pes, Barbara", + doi = "10.1007/s00521-019-04082-3", + issn = "14333058", + journal = "Neural Computing and Applications", + month = "may", + number = "10", + pages = "5951--5973", + publisher = "Springer", + title = "Ensemble feature selection for high-dimensional data: a stability analysis across multiple domains", + volume = "32", + year = "2020" ) ) - diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 021956af..93b4c2ab 100644 
--- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -39,6 +39,8 @@ #' @template param_terminator #' @template param_callbacks #' +#' @source +#' `r format_bib("saeys2008", "abeel2010", "pes2020")` #' @export #' @examples #' \donttest{ diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index 24878d94..f92d82b7 100644 --- a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -3,6 +3,22 @@ \name{ensemble_fselect} \alias{ensemble_fselect} \title{Ensemble Feature Selection} +\source{ +Saeys, Yvan, Abeel, Thomas, Van De Peer, Yves (2008). +\dQuote{Robust feature selection using ensemble feature selection techniques.} +\emph{Machine Learning and Knowledge Discovery in Databases}, \bold{5212 LNAI}, 313--325. +\doi{10.1007/978-3-540-87481-2_21}. + +Abeel, Thomas, Helleputte, Thibault, Van de Peer, Yves, Dupont, Pierre, Saeys, Yvan (2010). +\dQuote{Robust biomarker identification for cancer diagnosis with ensemble feature selection methods.} +\emph{Bioinformatics}, \bold{26}, 392--398. +ISSN 1367-4803, \doi{10.1093/BIOINFORMATICS/BTP630}. + +Pes, Barbara (2020). +\dQuote{Ensemble feature selection for high-dimensional data: a stability analysis across multiple domains.} +\emph{Neural Computing and Applications}, \bold{32}(10), 5951--5973. +ISSN 14333058, \doi{10.1007/s00521-019-04082-3}. 
+} \usage{ ensemble_fselect( fselector, From a50d0405189e5f85b93d2df91cd03b5b307acc01 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 31 May 2024 11:27:44 +0200 Subject: [PATCH 21/43] feat: add result object --- DESCRIPTION | 4 +- R/EnsembleFSResult.R | 75 +++++++++++++++ R/ensemble_fselect.R | 2 +- man/EnsembleFSResult.Rd | 123 +++++++++++++++++++++++++ tests/testthat/test_ensemble_fselect.R | 27 ++++-- 5 files changed, 223 insertions(+), 8 deletions(-) create mode 100644 R/EnsembleFSResult.R create mode 100644 man/EnsembleFSResult.Rd diff --git a/DESCRIPTION b/DESCRIPTION index bc854ec2..2559273d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,8 @@ Imports: lgr, mlr3misc (>= 0.15.0.9000), paradox (>= 1.0.0), - R6 + R6, + stabm Suggests: e1071, genalg, @@ -55,6 +56,7 @@ Collate: 'AutoFSelector.R' 'CallbackBatchFSelect.R' 'ContextBatchFSelect.R' + 'EnsembleFSResult.R' 'FSelectInstanceBatchSingleCrit.R' 'FSelectInstanceBatchMultiCrit.R' 'mlr_fselectors.R' diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R new file mode 100644 index 00000000..80794c0b --- /dev/null +++ b/R/EnsembleFSResult.R @@ -0,0 +1,75 @@ +#' @title Ensemble Feature Selection Result +#' +#' @description +#' The `EnsembleFSResult` stores the results of the ensemble feature selection. +#' The function [ensemble_fselect()] returns an object of this class. 
+#' +#' @examples +#' \donttest{ +#' efsr = ensemble_fselect( +#' fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), +#' task = tsk("sonar"), +#' learners = lrns(c("classif.rpart", "classif.featureless")), +#' init_resampling = rsmp("subsampling", repeats = 2), +#' inner_resampling = rsmp("cv", folds = 3), +#' measure = msr("classif.ce"), +#' terminator = trm("none") +#' ) +#' +#' # contains the benchmark result +#' efsr$benchmark_result +#' +#' # contains the selected features for each iteration +#' efsr$grid +#' +#' # returns the stability of the selected features +#' efsr$stability(stability_measure = "jaccard") +#' } +EnsembleFSResult = R6Class("EnsembleFSResult", + public = list( + + #' @field benchmark_result (`BenchmarkResult`)\cr + #' The benchmark result object. + benchmark_result = NULL, + + #' @field grid (`data.table`)\cr + #' The grid of feature selection results. + grid = NULL, + + #' @description + #' Creates a new instance of this [R6][R6::R6Class] class. + #' + #' @param benchmark_result (`BenchmarkResult`)\cr + #' The benchmark result object. + #' @param grid (`data.table`)\cr + #' The grid of feature selection results. + initialize = function(benchmark_result, grid) { + self$benchmark_result = assert_benchmark_result(benchmark_result) + self$grid = assert_data_table(grid) + }, + + #' @description + #' Returns the feature ranking. + feature_ranking = function() { + + }, + + #' @description + #' Calculates the stability of the selected features with the `stabm` package. + #' + #' @param stability_measure (`character(1)`)\cr + #' The stability measure to be used. + #' One of the measures returned by [stabm::listStabilityMeasures()] in lower case. + #' Default is `"jaccard"`. + #' @param ... (`any`)\cr + #' Additional arguments passed to the stability measure function. + stability = function(stability_measure = "jaccard", ...) 
{ + funs = stabm::listStabilityMeasures()$Name + keys = tolower(gsub("stability", "", funs)) + assert_choice(stability_measure, choices = keys) + + fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm")) + fun(self$grid$features, ...) + } + ) +) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 93b4c2ab..2244ce38 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -134,5 +134,5 @@ ensemble_fselect = function( set(grid, j = "importance", value = imp_scores) } - grid + EnsembleFSResult$new(bmr, grid) } diff --git a/man/EnsembleFSResult.Rd b/man/EnsembleFSResult.Rd new file mode 100644 index 00000000..6c679277 --- /dev/null +++ b/man/EnsembleFSResult.Rd @@ -0,0 +1,123 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/EnsembleFSResult.R +\name{EnsembleFSResult} +\alias{EnsembleFSResult} +\title{Ensemble Feature Selection Result} +\description{ +The \code{EnsembleFSResult} stores the results of the ensemble feature selection. +The function \code{\link[=ensemble_fselect]{ensemble_fselect()}} returns an object of this class. +} +\examples{ +\donttest{ +efsr = ensemble_fselect( + fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), + task = tsk("sonar"), + learners = lrns(c("classif.rpart", "classif.featureless")), + init_resampling = rsmp("subsampling", repeats = 2), + inner_resampling = rsmp("cv", folds = 3), + measure = msr("classif.ce"), + terminator = trm("none") +) + +# contains the benchmark result +efsr$benchmark_result + +# contains the selected features for each iteration +efsr$grid + +# returns the stability of the selected features +efsr$stability(stability_measure = "jaccard") +} +} +\section{Public fields}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{benchmark_result}}{(\code{BenchmarkResult})\cr +The benchmark result object.} + +\item{\code{grid}}{(\code{data.table})\cr +The grid of feature selection results.} +} +\if{html}{\out{
    }} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-EnsembleFSResult-new}{\code{EnsembleFSResult$new()}} +\item \href{#method-EnsembleFSResult-feature_ranking}{\code{EnsembleFSResult$feature_ranking()}} +\item \href{#method-EnsembleFSResult-stability}{\code{EnsembleFSResult$stability()}} +\item \href{#method-EnsembleFSResult-clone}{\code{EnsembleFSResult$clone()}} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EnsembleFSResult-new}{}}} +\subsection{Method \code{new()}}{ +Creates a new instance of this \link[R6:R6Class]{R6} class. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$new(benchmark_result, grid)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{benchmark_result}}{(\code{BenchmarkResult})\cr +The benchmark result object.} + +\item{\code{grid}}{(\code{data.table})\cr +The grid of feature selection results.} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EnsembleFSResult-feature_ranking}{}}} +\subsection{Method \code{feature_ranking()}}{ +Returns the feature ranking. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$feature_ranking()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EnsembleFSResult-stability}{}}} +\subsection{Method \code{stability()}}{ +Calculates the stability of the selected features with the \code{stabm} package. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$stability(stability_measure = "jaccard", ...)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{stability_measure}}{(\code{character(1)})\cr +The stability measure to be used. +One of the measures returned by \code{\link[stabm:listStabilityMeasures]{stabm::listStabilityMeasures()}} in lower case. +Default is \code{"jaccard"}.} + +\item{\code{...}}{(\code{any})\cr +Additional arguments passed to the stability measure function.} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EnsembleFSResult-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$clone(deep = FALSE)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
    }} +} +} +} diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index bebac790..39be1201 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -1,5 +1,5 @@ test_that("ensemble feature selection works", { - res = ensemble_fselect( + efsr = ensemble_fselect( fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), task = tsk("sonar"), learners = lrns(c("classif.rpart", "classif.featureless")), @@ -9,11 +9,26 @@ test_that("ensemble feature selection works", { terminator = trm("none") ) - expect_data_table(res, nrows = 4) - expect_list(res$features, any.missing = FALSE, len = 4) - expect_vector(res$n_features, size = 4) - expect_vector(res$classif.ce, size = 4) - expect_list(res$importance, any.missing = FALSE, len = 4) + expect_data_table(efsr$grid, nrows = 4) + expect_list(efsr$grid$features, any.missing = FALSE, len = 4) + expect_vector(efsr$grid$n_features, size = 4) + expect_vector(efsr$grid$classif.ce, size = 4) + expect_list(efsr$grid$importance, any.missing = FALSE, len = 4) + expect_benchmark_result(efsr$benchmark_result) +}) + +test_that("stability method works", { + efsr = ensemble_fselect( + fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), + task = tsk("sonar"), + learners = lrns(c("classif.rpart", "classif.featureless")), + init_resampling = rsmp("subsampling", repeats = 2), + inner_resampling = rsmp("cv", folds = 3), + measure = msr("classif.ce"), + terminator = trm("none") + ) + + expect_number(efsr$stability(stability_measure = "jaccard")) }) From a391bc3740e31043eb47a151a6b0d42a97928fe4 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 31 May 2024 11:32:30 +0200 Subject: [PATCH 22/43] add John as author --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2559273d..f4fe8cdf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,9 @@ Authors@R: c( person("Michel", "Lang", , 
"michellang@gmail.com", role = "aut", comment = c(ORCID = "0000-0001-9754-0393")), person("Bernd", "Bischl", , "bernd_bischl@gmx.net", role = "aut", - comment = c(ORCID = "0000-0001-6002-6980")) + comment = c(ORCID = "0000-0001-6002-6980")), + person("John", "Zobolas", , "bblodfon@gmail.com", role = "aut", + comment = c(ORCID = "0000-0002-3609-8674")) ) Description: Feature selection package of the 'mlr3' ecosystem. It selects the optimal feature set for any 'mlr3' learner. The package works with From 4bcc8a9bc9fbc689b4ba7af3e8530ef81e4096c8 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 31 May 2024 11:45:34 +0200 Subject: [PATCH 23/43] refactor: remove r6 objects from grid --- NAMESPACE | 1 + R/EnsembleFSResult.R | 38 ++++++++++++++++++-------- R/ensemble_fselect.R | 3 ++ man/EnsembleFSResult.Rd | 21 ++++++++------ tests/testthat/test_ensemble_fselect.R | 10 +++---- 5 files changed, 49 insertions(+), 24 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 9fe1de0a..85d360a8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ S3method(as.data.table,ArchiveBatchFSelect) S3method(as.data.table,DictionaryFSelector) +S3method(as.data.table,EnsembleFSResult) S3method(extract_inner_fselect_archives,BenchmarkResult) S3method(extract_inner_fselect_archives,ResampleResult) S3method(extract_inner_fselect_results,BenchmarkResult) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index 80794c0b..68731a45 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -20,7 +20,7 @@ #' efsr$benchmark_result #' #' # contains the selected features for each iteration -#' efsr$grid +#' efsr$result #' #' # returns the stability of the selected features #' efsr$stability(stability_measure = "jaccard") @@ -29,23 +29,19 @@ EnsembleFSResult = R6Class("EnsembleFSResult", public = list( #' @field benchmark_result (`BenchmarkResult`)\cr - #' The benchmark result object. + #' The benchmark result. 
benchmark_result = NULL, - #' @field grid (`data.table`)\cr - #' The grid of feature selection results. - grid = NULL, - #' @description #' Creates a new instance of this [R6][R6::R6Class] class. #' #' @param benchmark_result (`BenchmarkResult`)\cr #' The benchmark result object. - #' @param grid (`data.table`)\cr - #' The grid of feature selection results. - initialize = function(benchmark_result, grid) { + #' @param result ([data.table::data.table])\cr + #' The result of the ensemble feature selection results. + initialize = function(benchmark_result, result) { self$benchmark_result = assert_benchmark_result(benchmark_result) - self$grid = assert_data_table(grid) + private$.result = assert_data_table(result) }, #' @description @@ -69,7 +65,27 @@ EnsembleFSResult = R6Class("EnsembleFSResult", assert_choice(stability_measure, choices = keys) fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm")) - fun(self$grid$features, ...) + fun(self$result$features, ...) + } + ), + + active = list( + + #' @field result ([data.table::data.table])\cr + #' Returns the result of the ensemble feature selection. + result = function(rhs) { + assert_ro_binding(rhs) + tab = as.data.table(self$benchmark_result)[, c("task", "learner", "resampling"), with = FALSE] + cbind(private$.result, tab) } + ), + + private = list( + .result = NULL ) ) + +#' @export +as.data.table.EnsembleFSResult = function(x, ...) 
{ + x$result +} diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 2244ce38..2f0fdf71 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -134,5 +134,8 @@ ensemble_fselect = function( set(grid, j = "importance", value = imp_scores) } + set(grid, j = "learner", value = NULL) + set(grid, j = "task", value = NULL) + set(grid, j = "resampling", value = NULL) EnsembleFSResult$new(bmr, grid) } diff --git a/man/EnsembleFSResult.Rd b/man/EnsembleFSResult.Rd index 6c679277..306047d9 100644 --- a/man/EnsembleFSResult.Rd +++ b/man/EnsembleFSResult.Rd @@ -23,7 +23,7 @@ efsr = ensemble_fselect( efsr$benchmark_result # contains the selected features for each iteration -efsr$grid +efsr$result # returns the stability of the selected features efsr$stability(stability_measure = "jaccard") @@ -33,10 +33,15 @@ efsr$stability(stability_measure = "jaccard") \if{html}{\out{
    }} \describe{ \item{\code{benchmark_result}}{(\code{BenchmarkResult})\cr -The benchmark result object.} - -\item{\code{grid}}{(\code{data.table})\cr -The grid of feature selection results.} +The benchmark result.} +} +\if{html}{\out{
    }} +} +\section{Active bindings}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{result}}{(\link[data.table:data.table]{data.table::data.table})\cr +Returns the result of the ensemble feature selection.} } \if{html}{\out{
    }} } @@ -55,7 +60,7 @@ The grid of feature selection results.} \subsection{Method \code{new()}}{ Creates a new instance of this \link[R6:R6Class]{R6} class. \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{EnsembleFSResult$new(benchmark_result, grid)}\if{html}{\out{
    }} +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$new(benchmark_result, result)}\if{html}{\out{
    }} } \subsection{Arguments}{ @@ -64,8 +69,8 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. \item{\code{benchmark_result}}{(\code{BenchmarkResult})\cr The benchmark result object.} -\item{\code{grid}}{(\code{data.table})\cr -The grid of feature selection results.} +\item{\code{result}}{(\link[data.table:data.table]{data.table::data.table})\cr +The result of the ensemble feature selection results.} } \if{html}{\out{}} } diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index 39be1201..02ea0ced 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -9,11 +9,11 @@ test_that("ensemble feature selection works", { terminator = trm("none") ) - expect_data_table(efsr$grid, nrows = 4) - expect_list(efsr$grid$features, any.missing = FALSE, len = 4) - expect_vector(efsr$grid$n_features, size = 4) - expect_vector(efsr$grid$classif.ce, size = 4) - expect_list(efsr$grid$importance, any.missing = FALSE, len = 4) + expect_data_table(efsr$result, nrows = 4) + expect_list(efsr$result$features, any.missing = FALSE, len = 4) + expect_vector(efsr$result$n_features, size = 4) + expect_vector(efsr$result$classif.ce, size = 4) + expect_list(efsr$result$importance, any.missing = FALSE, len = 4) expect_benchmark_result(efsr$benchmark_result) }) From 62011f30e5c377697413f72b2de52ceef03c8c42 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 31 May 2024 12:06:17 +0200 Subject: [PATCH 24/43] feat: add feature_ranking method --- R/EnsembleFSResult.R | 21 +++++++++++++++++++-- man/EnsembleFSResult.Rd | 13 +++++++++++-- man/mlr3fselect-package.Rd | 1 + tests/testthat/test_ensemble_fselect.R | 17 +++-------------- 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index 68731a45..99ed0ab7 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -45,9 +45,26 @@ EnsembleFSResult = R6Class("EnsembleFSResult", }, #' @description - #' 
Returns the feature ranking. - feature_ranking = function() { + #' Calculates the feature ranking. + #' + #' @param method (`character(1)`)\cr + #' The method to calculate the feature ranking. + #' Currently, only `"inclusion_probability"` is supported. + feature_ranking = function(method = "inclusion_probability") { + assert_choice(method, choices = "inclusion_probability") + + features = self$benchmark_result$tasks$task[[1]]$feature_names + + count = map_int(features, function(feature) { + sum(map_lgl(self$result$features, function(iteration) { + feature %in% iteration + })) + }) + + res = data.table(feature = features, inclusion_probability = count / nrow(self$result)) + setorderv(res, "inclusion_probability", order = -1L) + res }, #' @description diff --git a/man/EnsembleFSResult.Rd b/man/EnsembleFSResult.Rd index 306047d9..15e70ed2 100644 --- a/man/EnsembleFSResult.Rd +++ b/man/EnsembleFSResult.Rd @@ -79,11 +79,20 @@ The result of the ensemble feature selection results.} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-EnsembleFSResult-feature_ranking}{}}} \subsection{Method \code{feature_ranking()}}{ -Returns the feature ranking. +Calculates the feature ranking. \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{EnsembleFSResult$feature_ranking()}\if{html}{\out{
    }} +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$feature_ranking(method = "inclusion_probability")}\if{html}{\out{
    }} } +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{method}}{(\code{character(1)})\cr +The method to calculate the feature ranking. +Currently, only \code{"inclusion_probability"} is supported.} +} +\if{html}{\out{
    }} +} } \if{html}{\out{
    }} \if{html}{\out{}} diff --git a/man/mlr3fselect-package.Rd b/man/mlr3fselect-package.Rd index 2548f8f9..51086bdc 100644 --- a/man/mlr3fselect-package.Rd +++ b/man/mlr3fselect-package.Rd @@ -27,6 +27,7 @@ Authors: \item Patrick Schratz \email{patrick.schratz@gmail.com} (\href{https://orcid.org/0000-0003-0748-6624}{ORCID}) \item Michel Lang \email{michellang@gmail.com} (\href{https://orcid.org/0000-0001-9754-0393}{ORCID}) \item Bernd Bischl \email{bernd_bischl@gmx.net} (\href{https://orcid.org/0000-0001-6002-6980}{ORCID}) + \item John Zobolas \email{bblodfon@gmail.com} (\href{https://orcid.org/0000-0002-3609-8674}{ORCID}) } } diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index 02ea0ced..74ff1678 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -15,20 +15,9 @@ test_that("ensemble feature selection works", { expect_vector(efsr$result$classif.ce, size = 4) expect_list(efsr$result$importance, any.missing = FALSE, len = 4) expect_benchmark_result(efsr$benchmark_result) -}) - -test_that("stability method works", { - efsr = ensemble_fselect( - fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), - task = tsk("sonar"), - learners = lrns(c("classif.rpart", "classif.featureless")), - init_resampling = rsmp("subsampling", repeats = 2), - inner_resampling = rsmp("cv", folds = 3), - measure = msr("classif.ce"), - terminator = trm("none") - ) expect_number(efsr$stability(stability_measure = "jaccard")) + feature_ranking = efsr$feature_ranking() + expect_data_table(feature_ranking, nrows = 60) + expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) }) - - From df1fd1567f228105b463f9969cb41c1bf9f20b50 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 31 May 2024 12:22:06 +0200 Subject: [PATCH 25/43] feat: cache results --- R/EnsembleFSResult.R | 30 ++++++++++++++++++++++++------ man/EnsembleFSResult.Rd | 15 ++++++++++++--- 2 files 
changed, 36 insertions(+), 9 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index 99ed0ab7..a5dc32bf 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -28,14 +28,14 @@ EnsembleFSResult = R6Class("EnsembleFSResult", public = list( - #' @field benchmark_result (`BenchmarkResult`)\cr + #' @field benchmark_result ([mlr3::BenchmarkResult])\cr #' The benchmark result. benchmark_result = NULL, #' @description #' Creates a new instance of this [R6][R6::R6Class] class. #' - #' @param benchmark_result (`BenchmarkResult`)\cr + #' @param benchmark_result ([mlr3::BenchmarkResult])\cr #' The benchmark result object. #' @param result ([data.table::data.table])\cr #' The result of the ensemble feature selection results. @@ -53,6 +53,11 @@ EnsembleFSResult = R6Class("EnsembleFSResult", feature_ranking = function(method = "inclusion_probability") { assert_choice(method, choices = "inclusion_probability") + # cached results + if (!is.null(private$.feature_ranking[[method]])) { + return(private$.feature_ranking[[method]]) + } + features = self$benchmark_result$tasks$task[[1]]$feature_names count = map_int(features, function(feature) { @@ -64,11 +69,14 @@ EnsembleFSResult = R6Class("EnsembleFSResult", res = data.table(feature = features, inclusion_probability = count / nrow(self$result)) setorderv(res, "inclusion_probability", order = -1L) - res + private$.feature_ranking[[method]] = res + private$.feature_ranking[[method]] }, #' @description #' Calculates the stability of the selected features with the `stabm` package. + #' The results are cached. + #' When the same stability measure is requested again with different arguments, the cache must be reset. #' #' @param stability_measure (`character(1)`)\cr #' The stability measure to be used. @@ -76,13 +84,21 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' Default is `"jaccard"`. #' @param ... (`any`)\cr #' Additional arguments passed to the stability measure function. 
- stability = function(stability_measure = "jaccard", ...) { + #' @param reset_cache (`logical(1)`)\cr + #' If `TRUE`, the cached results are ignored. + stability = function(stability_measure = "jaccard", ..., reset_cache = FALSE) { funs = stabm::listStabilityMeasures()$Name keys = tolower(gsub("stability", "", funs)) assert_choice(stability_measure, choices = keys) + # cached results + if (!is.null(private$.stability[[stability_measure]]) && !reset_cache) { + return(private$.stability[[stability_measure]]) + } + fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm")) - fun(self$result$features, ...) + private$.stability[[stability_measure]] = fun(self$result$features, ...) + private$.stability[[stability_measure]] } ), @@ -98,7 +114,9 @@ EnsembleFSResult = R6Class("EnsembleFSResult", ), private = list( - .result = NULL + .result = NULL, + .stability = NULL, + .feature_ranking = NULL ) ) diff --git a/man/EnsembleFSResult.Rd b/man/EnsembleFSResult.Rd index 15e70ed2..f854803b 100644 --- a/man/EnsembleFSResult.Rd +++ b/man/EnsembleFSResult.Rd @@ -32,7 +32,7 @@ efsr$stability(stability_measure = "jaccard") \section{Public fields}{ \if{html}{\out{
    }} \describe{ -\item{\code{benchmark_result}}{(\code{BenchmarkResult})\cr +\item{\code{benchmark_result}}{(\link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult})\cr The benchmark result.} } \if{html}{\out{
    }} @@ -66,7 +66,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. \subsection{Arguments}{ \if{html}{\out{
    }} \describe{ -\item{\code{benchmark_result}}{(\code{BenchmarkResult})\cr +\item{\code{benchmark_result}}{(\link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult})\cr The benchmark result object.} \item{\code{result}}{(\link[data.table:data.table]{data.table::data.table})\cr @@ -99,8 +99,14 @@ Currently, only \code{"inclusion_probability"} is supported.} \if{latex}{\out{\hypertarget{method-EnsembleFSResult-stability}{}}} \subsection{Method \code{stability()}}{ Calculates the stability of the selected features with the \code{stabm} package. +The results are cached. +When the same stability measure is requested again with different arguments, the cache must be reset. \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{EnsembleFSResult$stability(stability_measure = "jaccard", ...)}\if{html}{\out{
    }} +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$stability( + stability_measure = "jaccard", + ..., + reset_cache = FALSE +)}\if{html}{\out{
    }} } \subsection{Arguments}{ @@ -113,6 +119,9 @@ Default is \code{"jaccard"}.} \item{\code{...}}{(\code{any})\cr Additional arguments passed to the stability measure function.} + +\item{\code{reset_cache}}{(\code{logical(1)})\cr +If \code{TRUE}, the cached results are ignored.} } \if{html}{\out{
    }} } From bb55020ea66acb07cd58e53a39b8e99b1f11a91c Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 31 May 2024 12:51:11 +0200 Subject: [PATCH 26/43] feat: allow different callbacks --- R/ensemble_fselect.R | 14 ++++++++------ tests/testthat/test_ensemble_fselect.R | 12 ++++++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 2f0fdf71..0feef2f9 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -32,12 +32,14 @@ #' The inner resampling strategy used by the [FSelector]. #' @param store_models (`logical(1)`)\cr #' Whether to store models in [auto_fselector] or not. +#' @param callbacks (list of lists of [CallbackBatchFSelect])\cr +#' Callbacks to be used for each learner. +#' The lists must have the same length as the number of learners. #' #' @template param_fselector #' @template param_task #' @template param_measure #' @template param_terminator -#' @template param_callbacks #' #' @source #' `r format_bib("saeys2008", "abeel2010", "pes2020")` @@ -62,17 +64,17 @@ ensemble_fselect = function( inner_resampling, measure, terminator, - callbacks = list(), + callbacks = NULL, store_models = TRUE ) { assert_task(task) assert_learners(as_learners(learners), task = task) assert_resampling(init_resampling) - assert_choice(class(init_resampling)[1], - choices = c("ResamplingBootstrap", "ResamplingSubsampling")) + assert_choice(class(init_resampling)[1], choices = c("ResamplingBootstrap", "ResamplingSubsampling")) + assert_list(callbacks, types = "list", len = length(learners), null.ok = TRUE) # create auto_fselector for each learner - afss = map(learners, function(learner) { + afss = imap(unname(learners), function(learner, i) { auto_fselector( fselector = fselector, learner = learner, @@ -80,7 +82,7 @@ ensemble_fselect = function( measure = measure, terminator = terminator, store_models = store_models, - callbacks = callbacks + callbacks = callbacks[[i]] ) }) diff --git 
a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index 74ff1678..cb3eed5d 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -21,3 +21,15 @@ test_that("ensemble feature selection works", { expect_data_table(feature_ranking, nrows = 60) expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) }) + +test_that("different callbacks can be set", { + efsr = ensemble_fselect( + fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), + task = tsk("sonar"), + learners = lrns(c("classif.rpart", "classif.featureless")), + init_resampling = rsmp("subsampling", repeats = 2), + inner_resampling = rsmp("cv", folds = 3), + measure = msr("classif.ce"), + terminator = trm("none") + ) +}) From 3502ebd4be0505a19082e31025de5827a60949a1 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 31 May 2024 14:47:21 +0200 Subject: [PATCH 27/43] fix: callbacks --- R/ContextBatchFSelect.R | 16 ++++++------ R/EnsembleFSResult.R | 16 ++++++++++++ R/ObjectiveFSelectBatch.R | 3 +++ man/EnsembleFSResult.Rd | 36 ++++++++++++++++++++++++++ man/ensemble_fselect.Rd | 7 ++--- tests/testthat/test_ensemble_fselect.R | 13 +++++++++- 6 files changed, 79 insertions(+), 12 deletions(-) diff --git a/R/ContextBatchFSelect.R b/R/ContextBatchFSelect.R index bba0c7f7..b69b6015 100644 --- a/R/ContextBatchFSelect.R +++ b/R/ContextBatchFSelect.R @@ -19,9 +19,9 @@ ContextBatchFSelect = R6Class("ContextBatchFSelect", #' The feature sets of the latest batch. xss = function(rhs) { if (missing(rhs)) { - return(get_private(self$objective_fselect)$.xss) + return(get_private(self$instance$objective)$.xss) } else { - get_private(self$objective_fselect)$.xss = rhs + get_private(self$instance$objective)$.xss = rhs } }, @@ -29,9 +29,9 @@ ContextBatchFSelect = R6Class("ContextBatchFSelect", #' The benchmark design of the latest batch. 
design = function(rhs) { if (missing(rhs)) { - return(get_private(self$objective_fselect)$.design) + return(get_private(self$instance$objective)$.design) } else { - get_private(self$objective_fselect)$.design = rhs + get_private(self$instance$objective)$.design = rhs } }, @@ -39,9 +39,9 @@ ContextBatchFSelect = R6Class("ContextBatchFSelect", #' The benchmark result of the latest batch. benchmark_result = function(rhs) { if (missing(rhs)) { - return(get_private(self$objective_fselect)$.benchmark_result) + return(get_private(self$instance$objective)$.benchmark_result) } else { - get_private(self$objective_fselect)$.benchmark_result = rhs + get_private(self$instance$objective)$.benchmark_result = rhs } }, @@ -51,9 +51,9 @@ ContextBatchFSelect = R6Class("ContextBatchFSelect", #' A callback can add additional columns which are also written to the archive. aggregated_performance = function(rhs) { if (missing(rhs)) { - return(get_private(self$objective_fselect)$.aggregated_performance) + return(get_private(self$instance$objective)$.aggregated_performance) } else { - get_private(self$objective_fselect)$.aggregated_performance = rhs + get_private(self$instance$objective)$.aggregated_performance = rhs } } ) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index a5dc32bf..b4c98101 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -44,6 +44,22 @@ EnsembleFSResult = R6Class("EnsembleFSResult", private$.result = assert_data_table(result) }, + #' @description + #' Helper for print outputs. + #' @param ... (ignored). + format = function(...) { + sprintf("<%s>", class(self)[1L]) + }, + + #' @description + #' Printer. + #' + #' @param ... (ignored). + print = function(...) { + catf(format(self)) + print(self$result[, c("learner_id", "n_features"), with = FALSE]) + }, + #' @description #' Calculates the feature ranking. 
#' diff --git a/R/ObjectiveFSelectBatch.R b/R/ObjectiveFSelectBatch.R index 61a95c34..975410c8 100644 --- a/R/ObjectiveFSelectBatch.R +++ b/R/ObjectiveFSelectBatch.R @@ -89,6 +89,9 @@ ObjectiveFSelectBatch = R6Class("ObjectiveFSelectBatch", self$archive$benchmark_result$combine(private$.benchmark_result) set(private$.aggregated_performance, j = "uhash", value = private$.benchmark_result$uhashes) } + + call_back("on_eval_before_archive", self$callbacks, self$context) + private$.aggregated_performance }, diff --git a/man/EnsembleFSResult.Rd b/man/EnsembleFSResult.Rd index f854803b..6b52dc5a 100644 --- a/man/EnsembleFSResult.Rd +++ b/man/EnsembleFSResult.Rd @@ -49,6 +49,8 @@ Returns the result of the ensemble feature selection.} \subsection{Public methods}{ \itemize{ \item \href{#method-EnsembleFSResult-new}{\code{EnsembleFSResult$new()}} +\item \href{#method-EnsembleFSResult-format}{\code{EnsembleFSResult$format()}} +\item \href{#method-EnsembleFSResult-print}{\code{EnsembleFSResult$print()}} \item \href{#method-EnsembleFSResult-feature_ranking}{\code{EnsembleFSResult$feature_ranking()}} \item \href{#method-EnsembleFSResult-stability}{\code{EnsembleFSResult$stability()}} \item \href{#method-EnsembleFSResult-clone}{\code{EnsembleFSResult$clone()}} @@ -76,6 +78,40 @@ The result of the ensemble feature selection results.} } } \if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EnsembleFSResult-format}{}}} +\subsection{Method \code{format()}}{ +Helper for print outputs. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$format(...)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{...}}{(ignored).} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EnsembleFSResult-print}{}}} +\subsection{Method \code{print()}}{ +Printer. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$print(...)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{...}}{(ignored).} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-EnsembleFSResult-feature_ranking}{}}} \subsection{Method \code{feature_ranking()}}{ diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index f92d82b7..4111b0af 100644 --- a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -28,7 +28,7 @@ ensemble_fselect( inner_resampling, measure, terminator, - callbacks = list(), + callbacks = NULL, store_models = TRUE ) } @@ -56,8 +56,9 @@ Measure to optimize. If \code{NULL}, default measure is used.} \item{terminator}{(\link{Terminator})\cr Stop criterion of the feature selection.} -\item{callbacks}{(list of \link{CallbackBatchFSelect})\cr -List of callbacks.} +\item{callbacks}{(list of lists of \link{CallbackBatchFSelect})\cr +Callbacks to be used for each learner. +The lists must have the same length as the number of learners.} \item{store_models}{(\code{logical(1)})\cr Whether to store models in \link{auto_fselector} or not.} diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index cb3eed5d..3ef87036 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -23,6 +23,13 @@ test_that("ensemble feature selection works", { }) test_that("different callbacks can be set", { + + callback_test = callback_batch_fselect("mlr3fselect.test", + on_eval_before_archive = function(callback, context) { + context$aggregated_performance[, callback_active := context$instance$objective$learner$id == "classif.rpart"] + } + ) + efsr = ensemble_fselect( fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), task = tsk("sonar"), @@ -30,6 +37,10 @@ test_that("different callbacks can be set", { init_resampling = rsmp("subsampling", repeats = 2), inner_resampling = rsmp("cv", folds = 3), measure = msr("classif.ce"), - terminator = trm("none") + terminator = trm("none"), + callbacks = list(list(callback_test), list()) ) + + 
expect_true(all(efsr$benchmark_result$score()$learner[[1]]$fselect_instance$archive$data$callback_active)) + expect_null(efsr$benchmark_result$score()$learner[[2]]$fselect_instance$archive$data$callback_active) }) From 9532ab9fc778de5dd5a5be88c238ed89d59c731c Mon Sep 17 00:00:00 2001 From: john Date: Thu, 6 Jun 2024 12:29:37 +0200 Subject: [PATCH 28/43] add help() method --- R/EnsembleFSResult.R | 18 ++++++++++++++- ...embleFSResult.Rd => ensemble_fs_result.Rd} | 22 +++++++++++++++++-- tests/testthat/test_ensemble_fselect.R | 6 +++-- 3 files changed, 41 insertions(+), 5 deletions(-) rename man/{EnsembleFSResult.Rd => ensemble_fs_result.Rd} (89%) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index b4c98101..f2a2c05d 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -1,7 +1,12 @@ #' @title Ensemble Feature Selection Result #' +#' @name ensemble_fs_result +#' #' @description -#' The `EnsembleFSResult` stores the results of the ensemble feature selection. +#' The `EnsembleFSResult` stores the results of the ensemble feature selection +#' and incorporates methods for assessing the stability of the feature selection +#' and ranking the features. +#' #' The function [ensemble_fselect()] returns an object of this class. #' #' @examples @@ -32,6 +37,10 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' The benchmark result. benchmark_result = NULL, + #' @field man (`character(1)`)\cr + #' Manual page for this object. + man = NULL, + #' @description #' Creates a new instance of this [R6][R6::R6Class] class. 
#' @@ -42,6 +51,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", initialize = function(benchmark_result, result) { self$benchmark_result = assert_benchmark_result(benchmark_result) private$.result = assert_data_table(result) + self$man = "mlr3fselect::ensemble_fs_result" }, #' @description @@ -60,6 +70,12 @@ EnsembleFSResult = R6Class("EnsembleFSResult", print(self$result[, c("learner_id", "n_features"), with = FALSE]) }, + #' @description + #' Opens the corresponding help page referenced by field `$man`. + help = function() { + open_help(self$man) + }, + #' @description #' Calculates the feature ranking. #' diff --git a/man/EnsembleFSResult.Rd b/man/ensemble_fs_result.Rd similarity index 89% rename from man/EnsembleFSResult.Rd rename to man/ensemble_fs_result.Rd index 6b52dc5a..d78f216b 100644 --- a/man/EnsembleFSResult.Rd +++ b/man/ensemble_fs_result.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/EnsembleFSResult.R -\name{EnsembleFSResult} +\name{ensemble_fs_result} +\alias{ensemble_fs_result} \alias{EnsembleFSResult} \title{Ensemble Feature Selection Result} \description{ -The \code{EnsembleFSResult} stores the results of the ensemble feature selection. +The \code{EnsembleFSResult} stores the results of the ensemble feature selection +and incorporates methods for assessing the stability of the feature selection +and ranking the features. + The function \code{\link[=ensemble_fselect]{ensemble_fselect()}} returns an object of this class. 
} \examples{ @@ -34,6 +38,9 @@ efsr$stability(stability_measure = "jaccard") \describe{ \item{\code{benchmark_result}}{(\link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult})\cr The benchmark result.} + +\item{\code{man}}{(\code{character(1)})\cr +Manual page for this object.} } \if{html}{\out{}} } @@ -51,6 +58,7 @@ Returns the result of the ensemble feature selection.} \item \href{#method-EnsembleFSResult-new}{\code{EnsembleFSResult$new()}} \item \href{#method-EnsembleFSResult-format}{\code{EnsembleFSResult$format()}} \item \href{#method-EnsembleFSResult-print}{\code{EnsembleFSResult$print()}} +\item \href{#method-EnsembleFSResult-help}{\code{EnsembleFSResult$help()}} \item \href{#method-EnsembleFSResult-feature_ranking}{\code{EnsembleFSResult$feature_ranking()}} \item \href{#method-EnsembleFSResult-stability}{\code{EnsembleFSResult$stability()}} \item \href{#method-EnsembleFSResult-clone}{\code{EnsembleFSResult$clone()}} @@ -110,6 +118,16 @@ Printer. } \if{html}{\out{}} } +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EnsembleFSResult-help}{}}} +\subsection{Method \code{help()}}{ +Opens the corresponding help page referenced by field \verb{$man}. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$help()}\if{html}{\out{
    }} +} + } \if{html}{\out{
    }} \if{html}{\out{}} diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index 3ef87036..c75a778c 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -1,7 +1,8 @@ test_that("ensemble feature selection works", { + task = tsk("sonar") efsr = ensemble_fselect( fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), - task = tsk("sonar"), + task = task, learners = lrns(c("classif.rpart", "classif.featureless")), init_resampling = rsmp("subsampling", repeats = 2), inner_resampling = rsmp("cv", folds = 3), @@ -9,6 +10,7 @@ test_that("ensemble feature selection works", { terminator = trm("none") ) + expect_character(efsr$man) expect_data_table(efsr$result, nrows = 4) expect_list(efsr$result$features, any.missing = FALSE, len = 4) expect_vector(efsr$result$n_features, size = 4) @@ -18,7 +20,7 @@ test_that("ensemble feature selection works", { expect_number(efsr$stability(stability_measure = "jaccard")) feature_ranking = efsr$feature_ranking() - expect_data_table(feature_ranking, nrows = 60) + expect_data_table(feature_ranking, nrows = length(task$feature_names)) expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) }) From c17dff904cc2b2d7b904d39a8eeb86b076ed153a Mon Sep 17 00:00:00 2001 From: john Date: Fri, 7 Jun 2024 10:15:14 +0200 Subject: [PATCH 29/43] return result without R6 classes --- R/EnsembleFSResult.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index f2a2c05d..bdd152c8 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -120,7 +120,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' If `TRUE`, the cached results are ignored. 
stability = function(stability_measure = "jaccard", ..., reset_cache = FALSE) { funs = stabm::listStabilityMeasures()$Name - keys = tolower(gsub("stability", "", funs)) + keys = tolower(gsub("stability", "", funs)) assert_choice(stability_measure, choices = keys) # cached results @@ -140,8 +140,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' Returns the result of the ensemble feature selection. result = function(rhs) { assert_ro_binding(rhs) - tab = as.data.table(self$benchmark_result)[, c("task", "learner", "resampling"), with = FALSE] - cbind(private$.result, tab) + private$.result } ), From 766f10278cd8c54ada96de588561a6ec11e92130 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 7 Jun 2024 14:27:03 +0200 Subject: [PATCH 30/43] add task features in initialize() --- R/EnsembleFSResult.R | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index bdd152c8..1eaf2118 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -48,9 +48,23 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' The benchmark result object. #' @param result ([data.table::data.table])\cr #' The result of the ensemble feature selection results. - initialize = function(benchmark_result, result) { - self$benchmark_result = assert_benchmark_result(benchmark_result) - private$.result = assert_data_table(result) + #' @param features ([character()])\cr + #' The vector of features of the task that was used in the ensemble feature + #' selection. Ignored if `benchmark_result` is given and mandatory to have + #' if `benchmark_result` is `NULL`. 
+ initialize = function(benchmark_result = NULL, result, features) { + if (is.null(benchmark_result)) { + assert_character(features, any.missing = FALSE, null.ok = FALSE) + private$.features = features + } else { + self$benchmark_result = assert_benchmark_result(benchmark_result) + private$.features = self$benchmark_result$tasks$task[[1]]$feature_names + } + + assert_data_table(result) + assert_names(names(result), must.include = c("iter", "learner_id", "features", "n_features")) + + private$.result = result self$man = "mlr3fselect::ensemble_fs_result" }, @@ -147,7 +161,8 @@ EnsembleFSResult = R6Class("EnsembleFSResult", private = list( .result = NULL, .stability = NULL, - .feature_ranking = NULL + .feature_ranking = NULL, + .features = NULL ) ) From e90b1b2de83b4c87b15fa53cdf260ab487f9202e Mon Sep 17 00:00:00 2001 From: john Date: Fri, 7 Jun 2024 14:29:27 +0200 Subject: [PATCH 31/43] faster calculation of inclusion probabilities --- R/EnsembleFSResult.R | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index 1eaf2118..4865130b 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -96,6 +96,10 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' @param method (`character(1)`)\cr #' The method to calculate the feature ranking. #' Currently, only `"inclusion_probability"` is supported. 
+ #' + #' @return A [data.table][data.table::data.table] listing all the features, + #' ordered by decreasing inclusion probability scores (depending on the + #' `method`) feature_ranking = function(method = "inclusion_probability") { assert_choice(method, choices = "inclusion_probability") @@ -104,16 +108,21 @@ EnsembleFSResult = R6Class("EnsembleFSResult", return(private$.feature_ranking[[method]]) } - features = self$benchmark_result$tasks$task[[1]]$feature_names + count_tbl = sort(table(unlist(self$result$features)), decreasing = TRUE) + features_selected = names(count_tbl) + features_not_selected = setdiff(private$.features, features_selected) + + res_fs = data.table( + feature = features_selected, + inclusion_probability = as.vector(count_tbl) / nrow(self$result) + ) - count = map_int(features, function(feature) { - sum(map_lgl(self$result$features, function(iteration) { - feature %in% iteration - })) - }) + res_fns = data.table( + feature = features_not_selected, + inclusion_probability = 0 + ) - res = data.table(feature = features, inclusion_probability = count / nrow(self$result)) - setorderv(res, "inclusion_probability", order = -1L) + res = rbindlist(list(res_fs, res_fns)) private$.feature_ranking[[method]] = res private$.feature_ranking[[method]] From f20ddbee6eb71ab53a89ce2e328282b104b43a3d Mon Sep 17 00:00:00 2001 From: john Date: Fri, 7 Jun 2024 14:31:53 +0200 Subject: [PATCH 32/43] test init from data.table result --- tests/testthat/test_ensemble_fselect.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index c75a778c..380d1d0a 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -24,6 +24,18 @@ test_that("ensemble feature selection works", { expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) }) +test_that("EnsembleFSResult initialization", { + features = LETTERS + 
result = data.table(a = 1) # not proper column name + expect_error(EnsembleFSResult$new(result = result, features = features)) + + result = data.table(iter = 1:2, learner_id = list("l1", "l2"), + features = list(LETTERS[1], LETTERS[1:3]), + n_features = c(1,3)) + # works without benchmark result object + expect_class(EnsembleFSResult$new(result = result, features = features), "EnsembleFSResult") +}) + test_that("different callbacks can be set", { callback_test = callback_batch_fselect("mlr3fselect.test", From d023a215f39ddad872aef3977358d37955d78f6f Mon Sep 17 00:00:00 2001 From: john Date: Fri, 7 Jun 2024 14:36:06 +0200 Subject: [PATCH 33/43] refine doc --- R/EnsembleFSResult.R | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index 4865130b..dd834cab 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -3,9 +3,9 @@ #' @name ensemble_fs_result #' #' @description -#' The `EnsembleFSResult` stores the results of the ensemble feature selection -#' and incorporates methods for assessing the stability of the feature selection -#' and ranking the features. +#' The `EnsembleFSResult` class stores the results of ensemble feature selection. +#' It includes methods for evaluating the stability of the feature selection +#' process and for ranking the selected features. #' #' The function [ensemble_fselect()] returns an object of this class. #' @@ -29,6 +29,9 @@ #' #' # returns the stability of the selected features #' efsr$stability(stability_measure = "jaccard") +#' +#' # returns a ranking of all features +#' head(efsr$feature_ranking()) #' } EnsembleFSResult = R6Class("EnsembleFSResult", public = list( @@ -46,12 +49,12 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' #' @param benchmark_result ([mlr3::BenchmarkResult])\cr #' The benchmark result object. + #' Default is `NULL`, but the task's `"features"` must be given. 
#' @param result ([data.table::data.table])\cr - #' The result of the ensemble feature selection results. + #' The result of the ensemble feature selection. #' @param features ([character()])\cr #' The vector of features of the task that was used in the ensemble feature - #' selection. Ignored if `benchmark_result` is given and mandatory to have - #' if `benchmark_result` is `NULL`. + #' selection. Ignored if `"benchmark_result"` is given. initialize = function(benchmark_result = NULL, result, features) { if (is.null(benchmark_result)) { assert_character(features, any.missing = FALSE, null.ok = FALSE) @@ -129,7 +132,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", }, #' @description - #' Calculates the stability of the selected features with the `stabm` package. + #' Calculates the stability of the selected features with the \CRANpkg{stabm} package. #' The results are cached. #' When the same stability measure is requested again with different arguments, the cache must be reset. #' From 38b37e0cb438d79d8e421659b23ce9df4b80b0b7 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 7 Jun 2024 15:24:45 +0200 Subject: [PATCH 34/43] update docs --- man/ensemble_fs_result.Rd | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/man/ensemble_fs_result.Rd b/man/ensemble_fs_result.Rd index d78f216b..e395334f 100644 --- a/man/ensemble_fs_result.Rd +++ b/man/ensemble_fs_result.Rd @@ -5,9 +5,9 @@ \alias{EnsembleFSResult} \title{Ensemble Feature Selection Result} \description{ -The \code{EnsembleFSResult} stores the results of the ensemble feature selection -and incorporates methods for assessing the stability of the feature selection -and ranking the features. +The \code{EnsembleFSResult} class stores the results of ensemble feature selection. +It includes methods for evaluating the stability of the feature selection +process and for ranking the selected features. 
The function \code{\link[=ensemble_fselect]{ensemble_fselect()}} returns an object of this class. } @@ -31,6 +31,9 @@ efsr$result # returns the stability of the selected features efsr$stability(stability_measure = "jaccard") + +# returns a ranking of all features +head(efsr$feature_ranking()) } } \section{Public fields}{ @@ -70,17 +73,22 @@ Returns the result of the ensemble feature selection.} \subsection{Method \code{new()}}{ Creates a new instance of this \link[R6:R6Class]{R6} class. \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{EnsembleFSResult$new(benchmark_result, result)}\if{html}{\out{
    }} +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$new(benchmark_result = NULL, result, features)}\if{html}{\out{
    }} } \subsection{Arguments}{ \if{html}{\out{
    }} \describe{ \item{\code{benchmark_result}}{(\link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult})\cr -The benchmark result object.} +The benchmark result object. +Default is \code{NULL}, but the task's \code{"features"} must be given.} \item{\code{result}}{(\link[data.table:data.table]{data.table::data.table})\cr -The result of the ensemble feature selection results.} +The result of the ensemble feature selection.} + +\item{\code{features}}{(\code{\link[=character]{character()}})\cr +The vector of features of the task that was used in the ensemble feature +selection. Ignored if \code{"benchmark_result"} is given.} } \if{html}{\out{
    }} } @@ -147,12 +155,17 @@ Currently, only \code{"inclusion_probability"} is supported.} } \if{html}{\out{}} } +\subsection{Returns}{ +A \link[data.table:data.table]{data.table} listing all the features, +ordered by decreasing inclusion probability scores (depending on the +\code{method}) +} } \if{html}{\out{
    }} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-EnsembleFSResult-stability}{}}} \subsection{Method \code{stability()}}{ -Calculates the stability of the selected features with the \code{stabm} package. +Calculates the stability of the selected features with the \CRANpkg{stabm} package. The results are cached. When the same stability measure is requested again with different arguments, the cache must be reset. \subsection{Usage}{ From 602830567dd9dcc7a0b517115a42c81ffd943ea2 Mon Sep 17 00:00:00 2001 From: be-marc Date: Mon, 10 Jun 2024 16:59:23 +0200 Subject: [PATCH 35/43] refactor: make bmr optional --- R/EnsembleFSResult.R | 47 +++++++++++++------------- R/ensemble_fselect.R | 29 ++++++++-------- man/ensemble_fs_result.Rd | 29 +++++++++++----- man/ensemble_fselect.Rd | 24 ++++++------- tests/testthat/test_ensemble_fselect.R | 37 +++++++++++++++++++- 5 files changed, 104 insertions(+), 62 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index dd834cab..59d87323 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -3,12 +3,18 @@ #' @name ensemble_fs_result #' #' @description -#' The `EnsembleFSResult` class stores the results of ensemble feature selection. -#' It includes methods for evaluating the stability of the feature selection -#' process and for ranking the selected features. -#' +#' The `EnsembleFSResult` stores the results of ensemble feature selection. +#' It includes methods for evaluating the stability of the feature selection process and for ranking the selected features. #' The function [ensemble_fselect()] returns an object of this class. 
#' +#' @section S3 Methods: +#' * `as.data.table.EnsembleFSResult(x, benchmark_result = TRUE)`\cr +#' Returns a tabular view of the ensemble feature selection.\cr +#' [EnsembleFSResult] -> [data.table::data.table()]\cr +#' * `x` ([EnsembleFSResult]) +#' * `benchmark_result` (`logical(1)`)\cr +#' Whether to add the learner, task and resampling information from the benchmark result. +#' #' @examples #' \donttest{ #' efsr = ensemble_fselect( @@ -47,27 +53,20 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' @description #' Creates a new instance of this [R6][R6::R6Class] class. #' - #' @param benchmark_result ([mlr3::BenchmarkResult])\cr - #' The benchmark result object. - #' Default is `NULL`, but the task's `"features"` must be given. #' @param result ([data.table::data.table])\cr #' The result of the ensemble feature selection. #' @param features ([character()])\cr #' The vector of features of the task that was used in the ensemble feature #' selection. Ignored if `"benchmark_result"` is given. - initialize = function(benchmark_result = NULL, result, features) { - if (is.null(benchmark_result)) { - assert_character(features, any.missing = FALSE, null.ok = FALSE) - private$.features = features - } else { - self$benchmark_result = assert_benchmark_result(benchmark_result) - private$.features = self$benchmark_result$tasks$task[[1]]$feature_names - } - + #' @param benchmark_result ([mlr3::BenchmarkResult])\cr + #' The benchmark result object. + initialize = function(result, features, benchmark_result = NULL) { assert_data_table(result) assert_names(names(result), must.include = c("iter", "learner_id", "features", "n_features")) - private$.result = result + private$.features = assert_character(features, any.missing = FALSE, null.ok = FALSE) + self$benchmark_result = if (!is.null(benchmark_result)) assert_benchmark_result(benchmark_result) + self$man = "mlr3fselect::ensemble_fs_result" }, @@ -84,7 +83,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' @param ... 
(ignored). print = function(...) { catf(format(self)) - print(self$result[, c("learner_id", "n_features"), with = FALSE]) + print(private$.result[, c("iter", "learner_id", "n_features"), with = FALSE]) }, #' @description @@ -111,13 +110,13 @@ EnsembleFSResult = R6Class("EnsembleFSResult", return(private$.feature_ranking[[method]]) } - count_tbl = sort(table(unlist(self$result$features)), decreasing = TRUE) + count_tbl = sort(table(unlist(private$.result$features)), decreasing = TRUE) features_selected = names(count_tbl) features_not_selected = setdiff(private$.features, features_selected) res_fs = data.table( feature = features_selected, - inclusion_probability = as.vector(count_tbl) / nrow(self$result) + inclusion_probability = as.vector(count_tbl) / nrow(private$.result) ) res_fns = data.table( @@ -155,7 +154,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", } fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm")) - private$.stability[[stability_measure]] = fun(self$result$features, ...) + private$.stability[[stability_measure]] = fun(private$.result$features, ...) private$.stability[[stability_measure]] } ), @@ -166,7 +165,9 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' Returns the result of the ensemble feature selection. result = function(rhs) { assert_ro_binding(rhs) - private$.result + if (is.null(self$benchmark_result)) return(private$.result) + tab = as.data.table(self$benchmark_result)[, c("task", "learner", "resampling"), with = FALSE] + cbind(private$.result, tab) } ), @@ -179,6 +180,6 @@ EnsembleFSResult = R6Class("EnsembleFSResult", ) #' @export -as.data.table.EnsembleFSResult = function(x, ...) { +as.data.table.EnsembleFSResult = function(x, ...) { x$result } diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 0feef2f9..30f0b802 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -4,23 +4,15 @@ #' #' @description #' Ensemble feature selection using multiple learners. 
-#' The ensemble feature selection method is designed to identify the -#' most informative features from a given dataset by leveraging multiple -#' machine learning models and resampling techniques. +#' The ensemble feature selection method is designed to identify the most informative features from a given dataset by leveraging multiple machine learning models and resampling techniques. #' #' @details -#' The method begins by applying an initial resampling technique specified -#' by the user, to create **multiple subsamples** from the original dataset. -#' This resampling process helps in generating diverse subsets of data for -#' robust feature selection. +#' The method begins by applying an initial resampling technique specified by the user, to create **multiple subsamples** from the original dataset. +#' This resampling process helps in generating diverse subsets of data for robust feature selection. #' -#' For each subsample generated in the previous step, the method performs -#' **wrapped-based feature selection** ([auto_fselector]) using each provided -#' learner, the given inner resampling method, performance measure and -#' optimization algorithm. -#' This process generates a best feature subset for each combination of -#' subsample and learner. -#' Results are stored in a [data.table] object. +#' For each subsample generated in the previous step, the method performs **wrapped-based feature selection** ([auto_fselector]) using each provided learner, the given inner resampling method, performance measure and optimization algorithm. +#' This process generates the best feature subset for each combination of subsample and learner. +#' Results are stored in an [EnsembleFSResult]. #' #' @param learners (list of [mlr3::Learner])\cr #' The learners to be used for feature selection. @@ -30,6 +22,8 @@ #' Can only be [mlr_resamplings_subsampling] or [mlr_resamplings_bootstrap]. 
#' @param inner_resampling ([mlr3::Resampling])\cr #' The inner resampling strategy used by the [FSelector]. +#' @param store_benchmark_result (`logical(1)`)\cr +#' Whether to store the benchmark result in [EnsembleFSResult] or not. #' @param store_models (`logical(1)`)\cr #' Whether to store models in [auto_fselector] or not. #' @param callbacks (list of lists of [CallbackBatchFSelect])\cr @@ -65,6 +59,7 @@ ensemble_fselect = function( measure, terminator, callbacks = NULL, + store_benchmark_result = TRUE, store_models = TRUE ) { assert_task(task) @@ -72,6 +67,7 @@ ensemble_fselect = function( assert_resampling(init_resampling) assert_choice(class(init_resampling)[1], choices = c("ResamplingBootstrap", "ResamplingSubsampling")) assert_list(callbacks, types = "list", len = length(learners), null.ok = TRUE) + assert_flag(store_benchmark_result) # create auto_fselector for each learner afss = imap(unname(learners), function(learner, i) { @@ -139,5 +135,8 @@ ensemble_fselect = function( set(grid, j = "learner", value = NULL) set(grid, j = "task", value = NULL) set(grid, j = "resampling", value = NULL) - EnsembleFSResult$new(bmr, grid) + EnsembleFSResult$new( + result = grid, + features = task$feature_names, + benchmark_result = if (store_benchmark_result) bmr) } diff --git a/man/ensemble_fs_result.Rd b/man/ensemble_fs_result.Rd index e395334f..ea861eea 100644 --- a/man/ensemble_fs_result.Rd +++ b/man/ensemble_fs_result.Rd @@ -5,12 +5,24 @@ \alias{EnsembleFSResult} \title{Ensemble Feature Selection Result} \description{ -The \code{EnsembleFSResult} class stores the results of ensemble feature selection. -It includes methods for evaluating the stability of the feature selection -process and for ranking the selected features. - +The \code{EnsembleFSResult} stores the results of ensemble feature selection. +It includes methods for evaluating the stability of the feature selection process and for ranking the selected features. 
The function \code{\link[=ensemble_fselect]{ensemble_fselect()}} returns an object of this class. } +\section{S3 Methods}{ + +\itemize{ +\item \code{as.data.table.EnsembleFSResult(x, benchmark_result = TRUE)}\cr +Returns a tabular view of the ensemble feature selection.\cr +\link{EnsembleFSResult} -> \code{\link[data.table:data.table]{data.table::data.table()}}\cr +\itemize{ +\item \code{x} (\link{EnsembleFSResult}) +\item \code{benchmark_result} (\code{logical(1)})\cr +Whether to add the learner, task and resampling information from the benchmark result. +} +} +} + \examples{ \donttest{ efsr = ensemble_fselect( @@ -73,22 +85,21 @@ Returns the result of the ensemble feature selection.} \subsection{Method \code{new()}}{ Creates a new instance of this \link[R6:R6Class]{R6} class. \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{EnsembleFSResult$new(benchmark_result = NULL, result, features)}\if{html}{\out{
    }} +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$new(result, features, benchmark_result = NULL)}\if{html}{\out{
    }} } \subsection{Arguments}{ \if{html}{\out{
    }} \describe{ -\item{\code{benchmark_result}}{(\link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult})\cr -The benchmark result object. -Default is \code{NULL}, but the task's \code{"features"} must be given.} - \item{\code{result}}{(\link[data.table:data.table]{data.table::data.table})\cr The result of the ensemble feature selection.} \item{\code{features}}{(\code{\link[=character]{character()}})\cr The vector of features of the task that was used in the ensemble feature selection. Ignored if \code{"benchmark_result"} is given.} + +\item{\code{benchmark_result}}{(\link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult})\cr +The benchmark result object.} } \if{html}{\out{
    }} } diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index 4111b0af..2d166203 100644 --- a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -29,6 +29,7 @@ ensemble_fselect( measure, terminator, callbacks = NULL, + store_benchmark_result = TRUE, store_models = TRUE ) } @@ -60,28 +61,23 @@ Stop criterion of the feature selection.} Callbacks to be used for each learner. The lists must have the same length as the number of learners.} +\item{store_benchmark_result}{(\code{logical(1)})\cr +Whether to store the benchmark result in \link{EnsembleFSResult} or not.} + \item{store_models}{(\code{logical(1)})\cr Whether to store models in \link{auto_fselector} or not.} } \description{ Ensemble feature selection using multiple learners. -The ensemble feature selection method is designed to identify the -most informative features from a given dataset by leveraging multiple -machine learning models and resampling techniques. +The ensemble feature selection method is designed to identify the most informative features from a given dataset by leveraging multiple machine learning models and resampling techniques. } \details{ -The method begins by applying an initial resampling technique specified -by the user, to create \strong{multiple subsamples} from the original dataset. -This resampling process helps in generating diverse subsets of data for -robust feature selection. +The method begins by applying an initial resampling technique specified by the user, to create \strong{multiple subsamples} from the original dataset. +This resampling process helps in generating diverse subsets of data for robust feature selection. -For each subsample generated in the previous step, the method performs -\strong{wrapped-based feature selection} (\link{auto_fselector}) using each provided -learner, the given inner resampling method, performance measure and -optimization algorithm. 
-This process generates a best feature subset for each combination of -subsample and learner. -Results are stored in a \link{data.table} object. +For each subsample generated in the previous step, the method performs \strong{wrapped-based feature selection} (\link{auto_fselector}) using each provided learner, the given inner resampling method, performance measure and optimization algorithm. +This process generates the best feature subset for each combination of subsample and learner. +Results are stored in an \link{EnsembleFSResult}. } \examples{ \donttest{ diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index 380d1d0a..6c61268f 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -1,4 +1,32 @@ test_that("ensemble feature selection works", { + task = tsk("sonar") + efsr = ensemble_fselect( + fselector = fs("random_search"), + task = task, + learners = lrns(c("classif.rpart", "classif.featureless")), + init_resampling = rsmp("subsampling", repeats = 2), + inner_resampling = rsmp("cv", folds = 3), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 5) + ) + + expect_character(efsr$man) + expect_data_table(efsr$result, nrows = 4) + expect_list(efsr$result$features, any.missing = FALSE, len = 4) + expect_vector(efsr$result$n_features, size = 4) + expect_vector(efsr$result$classif.ce, size = 4) + expect_benchmark_result(efsr$benchmark_result) + + expect_number(efsr$stability(stability_measure = "jaccard")) + feature_ranking = efsr$feature_ranking() + expect_data_table(feature_ranking, nrows = length(task$feature_names)) + expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) + + tab = as.data.table(efsr) + tab +}) + +test_that("ensemble feature selection works with rfe", { task = tsk("sonar") efsr = ensemble_fselect( fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), @@ -22,6 +50,9 @@ test_that("ensemble feature 
selection works", { feature_ranking = efsr$feature_ranking() expect_data_table(feature_ranking, nrows = length(task$feature_names)) expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) + + tab = as.data.table(efsr) + tab }) test_that("EnsembleFSResult initialization", { @@ -33,7 +64,11 @@ test_that("EnsembleFSResult initialization", { features = list(LETTERS[1], LETTERS[1:3]), n_features = c(1,3)) # works without benchmark result object - expect_class(EnsembleFSResult$new(result = result, features = features), "EnsembleFSResult") + efsr = EnsembleFSResult$new(result = result, features = features) + expect_class(efsr, "EnsembleFSResult") + tab = as.data.table(efsr) + expect_data_table(tab) + expect_names(names(tab), identical.to = c("iter", "learner_id", "features", "n_features")) }) test_that("different callbacks can be set", { From f51500a8a9c727766e04aed05789a821ccfa3208 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 10 Jun 2024 18:06:38 +0200 Subject: [PATCH 36/43] correct 'iter' to 'resampling_id' --- R/EnsembleFSResult.R | 4 ++-- R/ensemble_fselect.R | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index 59d87323..064adcad 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -57,12 +57,12 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' The result of the ensemble feature selection. #' @param features ([character()])\cr #' The vector of features of the task that was used in the ensemble feature - #' selection. Ignored if `"benchmark_result"` is given. + #' selection. #' @param benchmark_result ([mlr3::BenchmarkResult])\cr #' The benchmark result object. 
initialize = function(result, features, benchmark_result = NULL) { assert_data_table(result) - assert_names(names(result), must.include = c("iter", "learner_id", "features", "n_features")) + assert_names(names(result), must.include = c("resampling_id", "learner_id", "features", "n_features")) private$.result = result private$.features = assert_character(features, any.missing = FALSE, null.ok = FALSE) self$benchmark_result = if (!is.null(benchmark_result)) assert_benchmark_result(benchmark_result) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 30f0b802..c179ef1a 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -35,6 +35,8 @@ #' @template param_measure #' @template param_terminator #' +#' @returns an [EnsembleFSResult] object. +#' #' @source #' `r format_bib("saeys2008", "abeel2010", "pes2020")` #' @export @@ -90,7 +92,7 @@ ensemble_fselect = function( resampling = rsmp("insample")$instantiate(task_subset) data.table( - iter = i, + resampling_id = i, learner_id = map(learners, "id"), learner = afss, task = list(task_subset), @@ -119,7 +121,6 @@ ensemble_fselect = function( afs$fselect_instance$archive$best()[, measure$id, with = FALSE][[1]] }) - set(grid, j = "iter", value = 1:bmr$n_resample_results) set(grid, j = "features", value = features) set(grid, j = "n_features", value = n_features) set(grid, j = measure$id, value = scores) @@ -138,5 +139,6 @@ ensemble_fselect = function( EnsembleFSResult$new( result = grid, features = task$feature_names, - benchmark_result = if (store_benchmark_result) bmr) + benchmark_result = if (store_benchmark_result) bmr + ) } From 122f2a4c8a81680428ee88b8deb9050974b7f3d3 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 10 Jun 2024 18:14:25 +0200 Subject: [PATCH 37/43] rename baseline feature ranking method to approval voting + add some doc --- R/EnsembleFSResult.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index 
064adcad..14035764 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -95,15 +95,23 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' @description #' Calculates the feature ranking. #' + #' @details + #' The feature ranking process is built on the following framework: models + #' act as voters, features act as candidates, and voters select certain + #' candidates (features). The primary objective is to compile these selections + #' into a consensus ranked list of features, effectively forming a committee. + #' Currently, only `"approval_voting"` method is supported, which selects the + #' candidates/features that have the highest approval score or selection + #' frequency, i.e. appear the most often. + #' #' @param method (`character(1)`)\cr #' The method to calculate the feature ranking. - #' Currently, only `"inclusion_probability"` is supported. #' #' @return A [data.table][data.table::data.table] listing all the features, #' ordered by decreasing inclusion probability scores (depending on the #' `method`) - feature_ranking = function(method = "inclusion_probability") { - assert_choice(method, choices = "inclusion_probability") + feature_ranking = function(method = "approval_voting") { + assert_choice(method, choices = "approval_voting") # cached results if (!is.null(private$.feature_ranking[[method]])) { From f4cabba54e3cc3321d5a66eb5845e03e2eac0443 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 10 Jun 2024 18:14:48 +0200 Subject: [PATCH 38/43] updocs --- man/ensemble_fs_result.Rd | 17 +++++++++++++---- man/ensemble_fselect.Rd | 3 +++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/man/ensemble_fs_result.Rd b/man/ensemble_fs_result.Rd index ea861eea..2cb41150 100644 --- a/man/ensemble_fs_result.Rd +++ b/man/ensemble_fs_result.Rd @@ -96,7 +96,7 @@ The result of the ensemble feature selection.} \item{\code{features}}{(\code{\link[=character]{character()}})\cr The vector of features of the task that was used in the ensemble feature 
-selection. Ignored if \code{"benchmark_result"} is given.} +selection.} \item{\code{benchmark_result}}{(\link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult})\cr The benchmark result object.} @@ -154,18 +154,27 @@ Opens the corresponding help page referenced by field \verb{$man}. \subsection{Method \code{feature_ranking()}}{ Calculates the feature ranking. \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{EnsembleFSResult$feature_ranking(method = "inclusion_probability")}\if{html}{\out{
    }} +\if{html}{\out{
    }}\preformatted{EnsembleFSResult$feature_ranking(method = "approval_voting")}\if{html}{\out{
    }} } \subsection{Arguments}{ \if{html}{\out{
    }} \describe{ \item{\code{method}}{(\code{character(1)})\cr -The method to calculate the feature ranking. -Currently, only \code{"inclusion_probability"} is supported.} +The method to calculate the feature ranking.} } \if{html}{\out{
    }} } +\subsection{Details}{ +The feature ranking process is built on the following framework: models +act as voters, features act as candidates, and voters select certain +candidates (features). The primary objective is to compile these selections +into a consensus ranked list of features, effectively forming a committee. +Currently, only \code{"approval_voting"} method is supported, which selects the +candidates/features that have the highest approval score or selection +frequency, i.e. appear the most often. +} + \subsection{Returns}{ A \link[data.table:data.table]{data.table} listing all the features, ordered by decreasing inclusion probability scores (depending on the diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index 2d166203..72dd0ff7 100644 --- a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -67,6 +67,9 @@ Whether to store the benchmark result in \link{EnsembleFSResult} or not.} \item{store_models}{(\code{logical(1)})\cr Whether to store models in \link{auto_fselector} or not.} } +\value{ +an \link{EnsembleFSResult} object. +} \description{ Ensemble feature selection using multiple learners. The ensemble feature selection method is designed to identify the most informative features from a given dataset by leveraging multiple machine learning models and resampling techniques. 
From 641465292bf21ebdee2da95d808aa323ce3ed81f Mon Sep 17 00:00:00 2001 From: john Date: Mon, 10 Jun 2024 18:24:43 +0200 Subject: [PATCH 39/43] fix test --- tests/testthat/test_ensemble_fselect.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index 6c61268f..0c39814c 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -60,7 +60,7 @@ test_that("EnsembleFSResult initialization", { result = data.table(a = 1) # not proper column name expect_error(EnsembleFSResult$new(result = result, features = features)) - result = data.table(iter = 1:2, learner_id = list("l1", "l2"), + result = data.table(resampling_id = 1:2, learner_id = list("l1", "l2"), features = list(LETTERS[1], LETTERS[1:3]), n_features = c(1,3)) # works without benchmark result object @@ -68,7 +68,7 @@ test_that("EnsembleFSResult initialization", { expect_class(efsr, "EnsembleFSResult") tab = as.data.table(efsr) expect_data_table(tab) - expect_names(names(tab), identical.to = c("iter", "learner_id", "features", "n_features")) + expect_names(names(tab), identical.to = c("resampling_id", "learner_id", "features", "n_features")) }) test_that("different callbacks can be set", { From 3628733b229a3628ff66585799c30ba3b5e125b7 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 10 Jun 2024 18:27:56 +0200 Subject: [PATCH 40/43] document result data.table columns --- R/EnsembleFSResult.R | 2 ++ man/ensemble_fs_result.Rd | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index 14035764..9c27636f 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -55,6 +55,8 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' #' @param result ([data.table::data.table])\cr #' The result of the ensemble feature selection. 
+ #' Column names should include `"resampling_id"`, `"learner_id"`, `"features"` + #' and `"n_features"`. #' @param features ([character()])\cr #' The vector of features of the task that was used in the ensemble feature #' selection. diff --git a/man/ensemble_fs_result.Rd b/man/ensemble_fs_result.Rd index 2cb41150..c394dda7 100644 --- a/man/ensemble_fs_result.Rd +++ b/man/ensemble_fs_result.Rd @@ -92,7 +92,9 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. \if{html}{\out{
    }} \describe{ \item{\code{result}}{(\link[data.table:data.table]{data.table::data.table})\cr -The result of the ensemble feature selection.} +The result of the ensemble feature selection. +Column names should include \code{"resampling_id"}, \code{"learner_id"}, \code{"features"} +and \code{"n_features"}.} \item{\code{features}}{(\code{\link[=character]{character()}})\cr The vector of features of the task that was used in the ensemble feature From bc12f35840a6e5b312bb23c2b65511921905853c Mon Sep 17 00:00:00 2001 From: be-marc Date: Tue, 11 Jun 2024 11:22:34 +0200 Subject: [PATCH 41/43] feat: per learner stability --- R/EnsembleFSResult.R | 52 ++++++++++++++--------- R/ensemble_fselect.R | 4 +- tests/testthat/test_ensemble_fselect.R | 59 +++++++++++++++++++++++--- 3 files changed, 88 insertions(+), 27 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index 9c27636f..e37250f7 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -64,7 +64,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' The benchmark result object. initialize = function(result, features, benchmark_result = NULL) { assert_data_table(result) - assert_names(names(result), must.include = c("resampling_id", "learner_id", "features", "n_features")) + assert_names(names(result), must.include = c("resampling_iteration", "learner_id", "features", "n_features")) private$.result = result private$.features = assert_character(features, any.missing = FALSE, null.ok = FALSE) self$benchmark_result = if (!is.null(benchmark_result)) assert_benchmark_result(benchmark_result) @@ -85,7 +85,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' @param ... (ignored). print = function(...) 
{ catf(format(self)) - print(private$.result[, c("iter", "learner_id", "n_features"), with = FALSE]) + print(private$.result[, c("resampling_iteration", "learner_id", "n_features"), with = FALSE]) }, #' @description @@ -98,20 +98,14 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' Calculates the feature ranking. #' #' @details - #' The feature ranking process is built on the following framework: models - #' act as voters, features act as candidates, and voters select certain - #' candidates (features). The primary objective is to compile these selections - #' into a consensus ranked list of features, effectively forming a committee. - #' Currently, only `"approval_voting"` method is supported, which selects the - #' candidates/features that have the highest approval score or selection - #' frequency, i.e. appear the most often. + #' The feature ranking process is built on the following framework: models act as voters, features act as candidates, and voters select certain candidates (features). + #' The primary objective is to compile these selections into a consensus ranked list of features, effectively forming a committee. + #' Currently, only `"approval_voting"` method is supported, which selects the candidates/features that have the highest approval score or selection frequency, i.e. appear the most often. #' #' @param method (`character(1)`)\cr #' The method to calculate the feature ranking. #' - #' @return A [data.table][data.table::data.table] listing all the features, - #' ordered by decreasing inclusion probability scores (depending on the - #' `method`) + #' @return A [data.table::data.table] listing all the features, ordered by decreasing inclusion probability scores (depending on the `method`) feature_ranking = function(method = "approval_voting") { assert_choice(method, choices = "approval_voting") @@ -151,21 +145,38 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' Default is `"jaccard"`. #' @param ... 
(`any`)\cr #' Additional arguments passed to the stability measure function. + #' @param global (`logical(1)`)\cr + #' Whether to calculate the stability globally or for each learner. #' @param reset_cache (`logical(1)`)\cr #' If `TRUE`, the cached results are ignored. - stability = function(stability_measure = "jaccard", ..., reset_cache = FALSE) { + stability = function(stability_measure = "jaccard", ..., global = TRUE, reset_cache = FALSE) { funs = stabm::listStabilityMeasures()$Name keys = tolower(gsub("stability", "", funs)) assert_choice(stability_measure, choices = keys) - # cached results - if (!is.null(private$.stability[[stability_measure]]) && !reset_cache) { - return(private$.stability[[stability_measure]]) + if (global) { + # cached results + if (!is.null(private$.stability_global[[stability_measure]]) && !reset_cache) { + return(private$.stability_global[[stability_measure]]) + } + + fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm")) + private$.stability_global[[stability_measure]] = fun(private$.result$features, ...) + private$.stability_global[[stability_measure]] + } else { + # cached results + if (!is.null(private$.stability_learner[[stability_measure]]) && !reset_cache) { + return(private$.stability_learner[[stability_measure]]) + } + + fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm")) + + tab = private$.result[, list(score = fun(.SD$features, ...)), by = learner_id] + private$.stability_learner[[stability_measure]] = set_names(tab$score, tab$learner_id) + private$.stability_learner[[stability_measure]] } - fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm")) - private$.stability[[stability_measure]] = fun(private$.result$features, ...) 
- private$.stability[[stability_measure]] + } ), @@ -183,7 +194,8 @@ EnsembleFSResult = R6Class("EnsembleFSResult", private = list( .result = NULL, - .stability = NULL, + .stability_global = NULL, + .stability_learner = NULL, .feature_ranking = NULL, .features = NULL ) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index c179ef1a..87942adf 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -92,8 +92,8 @@ ensemble_fselect = function( resampling = rsmp("insample")$instantiate(task_subset) data.table( - resampling_id = i, - learner_id = map(learners, "id"), + resampling_iteration = i, + learner_id = map_chr(learners, "id"), learner = afss, task = list(task_subset), resampling = list(resampling) diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index 0c39814c..d6a03a0a 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -17,13 +17,56 @@ test_that("ensemble feature selection works", { expect_vector(efsr$result$classif.ce, size = 4) expect_benchmark_result(efsr$benchmark_result) + # stability expect_number(efsr$stability(stability_measure = "jaccard")) + stability = efsr$stability(stability_measure = "jaccard", global = FALSE) + expect_numeric(stability, len = 2) + expect_names(names(stability), identical.to = c("classif.rpart", "classif.featureless")) + + # feature ranking + feature_ranking = efsr$feature_ranking() + expect_data_table(feature_ranking, nrows = length(task$feature_names)) + expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) + + # data.table conversion + tab = as.data.table(efsr) + expect_names(names(tab), identical.to = c("resampling_iteration", "learner_id", "features", "n_features", "classif.ce", "task", "learner", "resampling")) +}) + +test_that("ensemble feature selection works without benchmark result", { + task = tsk("sonar") + efsr = ensemble_fselect( + fselector = fs("random_search"), + task = 
task, + learners = lrns(c("classif.rpart", "classif.featureless")), + init_resampling = rsmp("subsampling", repeats = 2), + inner_resampling = rsmp("cv", folds = 3), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 5), + store_benchmark_result = FALSE + ) + + expect_character(efsr$man) + expect_data_table(efsr$result, nrows = 4) + expect_list(efsr$result$features, any.missing = FALSE, len = 4) + expect_vector(efsr$result$n_features, size = 4) + expect_vector(efsr$result$classif.ce, size = 4) + expect_null(efsr$benchmark_result) + + # stability + expect_number(efsr$stability(stability_measure = "jaccard")) + stability = efsr$stability(stability_measure = "jaccard", global = FALSE) + expect_numeric(stability, len = 2) + expect_names(names(stability), identical.to = c("classif.rpart", "classif.featureless")) + + # feature ranking feature_ranking = efsr$feature_ranking() expect_data_table(feature_ranking, nrows = length(task$feature_names)) expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) + # data.table conversion tab = as.data.table(efsr) - tab + expect_names(names(tab), identical.to = c("resampling_iteration", "learner_id", "features", "n_features", "classif.ce")) }) test_that("ensemble feature selection works with rfe", { @@ -46,13 +89,20 @@ test_that("ensemble feature selection works with rfe", { expect_list(efsr$result$importance, any.missing = FALSE, len = 4) expect_benchmark_result(efsr$benchmark_result) + # stability expect_number(efsr$stability(stability_measure = "jaccard")) + stability = efsr$stability(stability_measure = "jaccard", global = FALSE) + expect_numeric(stability, len = 2) + expect_names(names(stability), identical.to = c("classif.rpart", "classif.featureless")) + + # feature ranking feature_ranking = efsr$feature_ranking() expect_data_table(feature_ranking, nrows = length(task$feature_names)) expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) + 
# data.table conversion tab = as.data.table(efsr) - tab + expect_names(names(tab), identical.to = c("resampling_iteration", "learner_id", "features", "n_features", "classif.ce", "importance", "task", "learner", "resampling")) }) test_that("EnsembleFSResult initialization", { @@ -60,7 +110,7 @@ test_that("EnsembleFSResult initialization", { result = data.table(a = 1) # not proper column name expect_error(EnsembleFSResult$new(result = result, features = features)) - result = data.table(resampling_id = 1:2, learner_id = list("l1", "l2"), + result = data.table(resampling_iteration = 1:2, learner_id = list("l1", "l2"), features = list(LETTERS[1], LETTERS[1:3]), n_features = c(1,3)) # works without benchmark result object @@ -68,11 +118,10 @@ test_that("EnsembleFSResult initialization", { expect_class(efsr, "EnsembleFSResult") tab = as.data.table(efsr) expect_data_table(tab) - expect_names(names(tab), identical.to = c("resampling_id", "learner_id", "features", "n_features")) + expect_names(names(tab), identical.to = c("resampling_iteration", "learner_id", "features", "n_features")) }) test_that("different callbacks can be set", { - callback_test = callback_batch_fselect("mlr3fselect.test", on_eval_before_archive = function(callback, context) { context$aggregated_performance[, callback_active := context$instance$objective$learner$id == "classif.rpart"] From 7ac4b6ca0b3f02a781d24deb85ab7b22d6a6751f Mon Sep 17 00:00:00 2001 From: be-marc Date: Tue, 11 Jun 2024 11:27:40 +0200 Subject: [PATCH 42/43] docs: update --- R/EnsembleFSResult.R | 39 ++++++++++++++------------- R/ensemble_fselect.R | 4 ++- man/ensemble_fs_result.Rd | 56 ++++++++++++++++++++------------------- man/ensemble_fselect.Rd | 4 ++- 4 files changed, 55 insertions(+), 48 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index e37250f7..54ad93d5 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -17,27 +17,27 @@ #' #' @examples #' \donttest{ -#' efsr = ensemble_fselect( -#' 
fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), -#' task = tsk("sonar"), -#' learners = lrns(c("classif.rpart", "classif.featureless")), -#' init_resampling = rsmp("subsampling", repeats = 2), -#' inner_resampling = rsmp("cv", folds = 3), -#' measure = msr("classif.ce"), -#' terminator = trm("none") -#' ) +#' efsr = ensemble_fselect( +#' fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), +#' task = tsk("sonar"), +#' learners = lrns(c("classif.rpart", "classif.featureless")), +#' init_resampling = rsmp("subsampling", repeats = 2), +#' inner_resampling = rsmp("cv", folds = 3), +#' measure = msr("classif.ce"), +#' terminator = trm("none") +#' ) #' -#' # contains the benchmark result -#' efsr$benchmark_result +#' # contains the benchmark result +#' efsr$benchmark_result #' -#' # contains the selected features for each iteration -#' efsr$result +#' # contains the selected features for each iteration +#' efsr$result #' -#' # returns the stability of the selected features -#' efsr$stability(stability_measure = "jaccard") +#' # returns the stability of the selected features +#' efsr$stability(stability_measure = "jaccard") #' -#' # returns a ranking of all features -#' head(efsr$feature_ranking()) +#' # returns a ranking of all features +#' head(efsr$feature_ranking()) #' } EnsembleFSResult = R6Class("EnsembleFSResult", public = list( @@ -149,6 +149,9 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' Whether to calculate the stability globally or for each learner. #' @param reset_cache (`logical(1)`)\cr #' If `TRUE`, the cached results are ignored. + #' + #' @return A `numeric()` value representing the stability of the selected features. + #' Or a `numeric()` vector with the stability of the selected features for each learner. 
stability = function(stability_measure = "jaccard", ..., global = TRUE, reset_cache = FALSE) { funs = stabm::listStabilityMeasures()$Name keys = tolower(gsub("stability", "", funs)) @@ -175,8 +178,6 @@ EnsembleFSResult = R6Class("EnsembleFSResult", private$.stability_learner[[stability_measure]] = set_names(tab$score, tab$learner_id) private$.stability_learner[[stability_measure]] } - - } ), diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index 87942adf..afde7803 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -5,6 +5,7 @@ #' @description #' Ensemble feature selection using multiple learners. #' The ensemble feature selection method is designed to identify the most informative features from a given dataset by leveraging multiple machine learning models and resampling techniques. +#' Returns an [EnsembleFSResult]. #' #' @details #' The method begins by applying an initial resampling technique specified by the user, to create **multiple subsamples** from the original dataset. 
@@ -42,7 +43,7 @@ #' @export #' @examples #' \donttest{ -#' ensemble_fselect( +#' efsr = ensemble_fselect( #' fselector = fs("random_search"), #' task = tsk("sonar"), #' learners = lrns(c("classif.rpart", "classif.featureless")), @@ -51,6 +52,7 @@ #' measure = msr("classif.ce"), #' terminator = trm("evals", n_evals = 10) #' ) +#' efsr #' } ensemble_fselect = function( fselector, diff --git a/man/ensemble_fs_result.Rd b/man/ensemble_fs_result.Rd index c394dda7..d5720cfb 100644 --- a/man/ensemble_fs_result.Rd +++ b/man/ensemble_fs_result.Rd @@ -25,27 +25,27 @@ Whether to add the learner, task and resampling information from the benchmark r \examples{ \donttest{ -efsr = ensemble_fselect( - fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), - task = tsk("sonar"), - learners = lrns(c("classif.rpart", "classif.featureless")), - init_resampling = rsmp("subsampling", repeats = 2), - inner_resampling = rsmp("cv", folds = 3), - measure = msr("classif.ce"), - terminator = trm("none") -) + efsr = ensemble_fselect( + fselector = fs("rfe", n_features = 2, feature_fraction = 0.8), + task = tsk("sonar"), + learners = lrns(c("classif.rpart", "classif.featureless")), + init_resampling = rsmp("subsampling", repeats = 2), + inner_resampling = rsmp("cv", folds = 3), + measure = msr("classif.ce"), + terminator = trm("none") + ) -# contains the benchmark result -efsr$benchmark_result + # contains the benchmark result + efsr$benchmark_result -# contains the selected features for each iteration -efsr$result + # contains the selected features for each iteration + efsr$result -# returns the stability of the selected features -efsr$stability(stability_measure = "jaccard") + # returns the stability of the selected features + efsr$stability(stability_measure = "jaccard") -# returns a ranking of all features -head(efsr$feature_ranking()) + # returns a ranking of all features + head(efsr$feature_ranking()) } } \section{Public fields}{ @@ -168,19 +168,13 @@ The method to calculate the 
feature ranking.} \if{html}{\out{
    }} } \subsection{Details}{ -The feature ranking process is built on the following framework: models -act as voters, features act as candidates, and voters select certain -candidates (features). The primary objective is to compile these selections -into a consensus ranked list of features, effectively forming a committee. -Currently, only \code{"approval_voting"} method is supported, which selects the -candidates/features that have the highest approval score or selection -frequency, i.e. appear the most often. +The feature ranking process is built on the following framework: models act as voters, features act as candidates, and voters select certain candidates (features). +The primary objective is to compile these selections into a consensus ranked list of features, effectively forming a committee. +Currently, only \code{"approval_voting"} method is supported, which selects the candidates/features that have the highest approval score or selection frequency, i.e. appear the most often. } \subsection{Returns}{ -A \link[data.table:data.table]{data.table} listing all the features, -ordered by decreasing inclusion probability scores (depending on the -\code{method}) +A \link[data.table:data.table]{data.table::data.table} listing all the features, ordered by decreasing inclusion probability scores (depending on the \code{method}) } } \if{html}{\out{
    }} @@ -194,6 +188,7 @@ When the same stability measure is requested again with different arguments, the \if{html}{\out{
    }}\preformatted{EnsembleFSResult$stability( stability_measure = "jaccard", ..., + global = TRUE, reset_cache = FALSE )}\if{html}{\out{
    }} } @@ -209,11 +204,18 @@ Default is \code{"jaccard"}.} \item{\code{...}}{(\code{any})\cr Additional arguments passed to the stability measure function.} +\item{\code{global}}{(\code{logical(1)})\cr +Whether to calculate the stability globally or for each learner.} + \item{\code{reset_cache}}{(\code{logical(1)})\cr If \code{TRUE}, the cached results are ignored.} } \if{html}{\out{}} } +\subsection{Returns}{ +A \code{numeric()} value representing the global stability of the selected features, +or a \code{numeric()} vector with the stability of the selected features for each learner if \code{global = FALSE}. +} }
    }} \if{html}{\out{}} diff --git a/man/ensemble_fselect.Rd b/man/ensemble_fselect.Rd index 72dd0ff7..6a8454f1 100644 --- a/man/ensemble_fselect.Rd +++ b/man/ensemble_fselect.Rd @@ -73,6 +73,7 @@ an \link{EnsembleFSResult} object. \description{ Ensemble feature selection using multiple learners. The ensemble feature selection method is designed to identify the most informative features from a given dataset by leveraging multiple machine learning models and resampling techniques. +Returns an \link{EnsembleFSResult}. } \details{ The method begins by applying an initial resampling technique specified by the user, to create \strong{multiple subsamples} from the original dataset. @@ -84,7 +85,7 @@ Results are stored in an \link{EnsembleFSResult}. } \examples{ \donttest{ - ensemble_fselect( + efsr = ensemble_fselect( fselector = fs("random_search"), task = tsk("sonar"), learners = lrns(c("classif.rpart", "classif.featureless")), @@ -93,5 +94,6 @@ Results are stored in an \link{EnsembleFSResult}. measure = msr("classif.ce"), terminator = trm("evals", n_evals = 10) ) + efsr } } From da3762af27b1b4a30c7bcd72334d0b93ed503dc8 Mon Sep 17 00:00:00 2001 From: be-marc Date: Tue, 11 Jun 2024 11:34:26 +0200 Subject: [PATCH 43/43] chore: news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 69450d1c..44852289 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # mlr3fselect (development version) +* feat: Add ensemble feature selection function `ensemble_fselect()`. + # mlr3fselect 0.12.0 * feat: Add number of features to `instance$result`.