From c7aacceb7ab91636b032a28496379874b010a0d3 Mon Sep 17 00:00:00 2001 From: Oleg Sofrygin Date: Sat, 28 Jan 2017 19:14:53 -0800 Subject: [PATCH] adding docs for xgb.grid and clean-up R checks --- NAMESPACE | 9 +- R/ModelGLMsClass.R | 6 +- R/ModelH2OGridLearner.R | 51 +-- R/ModelH2OOutOfSamplePrediction.R | 4 +- R/ModelPredictionStack.R | 8 +- R/ModelXGBoostClass.R | 2 +- R/ModelXGBoostGridLearner.R | 54 ++- R/ModelXGBoostOutOfSamplePrediction.R | 34 +- R/Modelbrokenstick.R | 41 +- R/Modelface.R | 4 +- R/fit_main.R | 7 +- R/summaryS3methods.R | 235 +++-------- R/utility_funs.R | 40 +- R/wrappers.R | 393 ------------------ inst/report/report-script-rmd.R | 2 +- man/XGBoostClass.Rd | 2 +- man/fit.ModelStack.Rd | 2 - man/glmModelClass.Rd | 2 +- man/pander.H2OBinomialMetrics.Rd | 2 + man/pander.H2ORegressionMetrics.Rd | 2 + man/plotMSEs.Rd | 6 +- man/print.GLMmodel.Rd | 4 +- man/print.H2Oensemblemodel.Rd | 6 +- man/print.ModelStack.Rd | 4 +- man/print.PredictionStack.Rd | 9 +- man/print.brokenstickmodel.Rd | 4 +- man/save_best_model.Rd | 2 + man/summary.GLMmodel.Rd | 4 +- man/summary.H2ORegressionModel.Rd | 6 +- man/summary.H2Oensemblemodel.Rd | 4 +- man/summary.brokenstickmodel.Rd | 4 +- man/summary.xgb.Booster.Rd | 6 +- man/xgb.grid.Rd | 82 ++++ .../RUnit/RUnit_tests_02b_SLexample_xgboost.R | 6 +- 34 files changed, 309 insertions(+), 738 deletions(-) delete mode 100644 R/wrappers.R create mode 100644 man/xgb.grid.Rd diff --git a/NAMESPACE b/NAMESPACE index b8f0457..59538f5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,7 +4,6 @@ S3method("+",ModelStack) S3method(fit,ModelStack) S3method(fit,brokenstick) S3method(fit,face) -S3method(fit,h2ogrid) S3method(fit,xgb.grid) S3method(newFitModel,brokenstick) S3method(newFitModel,face) @@ -51,18 +50,12 @@ export(eval_MSE) export(faceModelClass) export(fit) export(fit_model) -export(fit_single_h2o_grid) export(fit_single_xgboost_grid) export(get_out_of_sample_predictions) export(get_train_data) export(get_validation_data) export(glmModelClass) export(gridislOptions) -export(h2o.deeplearning.wrapper) -export(h2o.example.wrapper) -export(h2o.gbm.wrapper) -export(h2o.glm.wrapper) -export(h2o.randomForest.wrapper) export(make_PredictionStack) export(make_model_report) export(newFitModel) @@ -77,6 +70,8 @@ export(set_all_gridisl_options) export(xgb.grid) import(R6) import(data.table) +import(ggiraph) +import(ggplot2) importFrom(Rcpp,sourceCpp) importFrom(assertthat,assert_that) importFrom(assertthat,is.count) diff --git a/R/ModelGLMsClass.R b/R/ModelGLMsClass.R index d314db9..700572d 100644 --- a/R/ModelGLMsClass.R +++ b/R/ModelGLMsClass.R @@ -19,7 +19,7 @@ fit.glm <- function(fit.class, params, Xmat, Yvals, model_contrl, ...) { SuppressGivenWarnings({ model.fit <- stats::glm.fit(x = Xmat, y = Yvals, - family = quasibinomial() , + family = stats::quasibinomial() , control = ctrl) }, GetWarningsToSuppress()) } @@ -54,7 +54,7 @@ fit.speedglm <- function(fit.class, params, Xmat, Yvals, model_contrl, ...) 
{ model.fit <- try(speedglm::speedglm.wfit(X = Xmat, y = Yvals, method = 'Cholesky', - family = quasibinomial(), + family = stats::quasibinomial(), trace = FALSE), silent = TRUE) }, GetWarningsToSuppress()) @@ -121,7 +121,7 @@ predictP1.GLMmodel <- function(m.fit, ParentObject, DataStorageObject, subset_id #' } #' @section Methods: #' \describe{ -#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{\link{RegressionClass}} object to instantiate a new storage container for a +#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{RegressionClass} object to instantiate a new storage container for a #' design matrix and binary outcome.} #' \item{\code{setdata()}}{...} #' } diff --git a/R/ModelH2OGridLearner.R b/R/ModelH2OGridLearner.R index e5953b2..dae7450 100644 --- a/R/ModelH2OGridLearner.R +++ b/R/ModelH2OGridLearner.R @@ -1,49 +1,5 @@ -# # default metalearner (NN least squares) -# h2o.glm_nn <- function(..., non_negative = TRUE) h2o.glm.wrapper(..., non_negative = non_negative) - -# # Train a model using a single h2o learner (user-spec) with cross-validation & keep CV predictions -# # @export -# fit_single_h2o_learner <- function(learner, training_frame, y, x, family = "binomial", model_contrl, fold_column, validation_frame = NULL, ...) { -# if (gvars$verbose) h2o::h2o.show_progress() else h2o::h2o.no_progress() -# learner_fun <- match.fun(learner) - -# mainArgs <- list(y = y, training_frame = training_frame, family = family, -# keep_cross_validation_predictions = TRUE, -# keep_cross_validation_fold_assignment = TRUE) - -# if (!missing(fold_column)) { -# if (!is.null(fold_column) && is.character(fold_column) && (fold_column != "")) { -# mainArgs$fold_column <- fold_column -# } -# } - -# mainArgs <- replace_add_user_args(mainArgs, model_contrl, fun = learner_fun) - -# if (!is.null(validation_frame)) mainArgs$validation_frame <- validation_frame - -# if (("x" %in% names(formals(learner))) && (as.character(formals(learner)$x)[1] != "")) { -# # Special case where we pass a subset of the colnames, x, in a custom learner function wrapper -# # model_fit <- learner_fun(y = y, training_frame = training_frame, validation_frame = NULL, family = family, fold_column = fold_column, keep_cross_validation_folds = TRUE) -# } else { -# # Use all predictors in training set for training -# mainArgs$x <- x -# # model_fit <- learner_fun(y = y, x = x, training_frame = training_frame, validation_frame = NULL, family = family, fold_column = fold_column, keep_cross_validation_folds = TRUE) -# } - -# model_fit <- do.call(learner_fun, mainArgs) - -# fit <- vector(mode = "list") -# fit$fitfunname <- learner; -# if (gvars$verbose) { -# print("grid search fitted models:"); print(model_fit) -# } -# fit$model_fit <- model_fit -# class(fit) <- c(class(fit)[1], c("H2Omodel")) -# return(fit) -# } - - -#' @export + + fit_single_h2o_grid <- function(grid.algorithm, training_frame, y, x, family = "binomial", model_contrl, fold_column, validation_frame = NULL, ...) 
{ if (gvars$verbose) h2o::h2o.show_progress() else h2o::h2o.no_progress() mainArgs <- list(x = x, y = y, training_frame = training_frame, @@ -81,7 +37,7 @@ fit_single_h2o_grid <- function(grid.algorithm, training_frame, y, x, family = " ## doesn't work if h2o namespace is not loaded: # algo_fun <- get0(algo_fun_name, mode = "function", inherits = TRUE) - algo_fun <- getFromNamespace(algo_fun_name, ns='h2o') + algo_fun <- utils::getFromNamespace(algo_fun_name, ns='h2o') mainArgs <- keep_only_fun_args(mainArgs, fun = algo_fun) # Keep only the relevant args in mainArgs list: mainArgs <- replace_add_user_args(mainArgs, model_contrl, fun = algo_fun) # Add user args that pertain to this specific learner: @@ -120,7 +76,6 @@ fit_single_h2o_grid <- function(grid.algorithm, training_frame, y, x, family = " } -#' @export fit.h2ogrid <- function(fit.class, params, training_frame, y, x, model_contrl, fold_column, ...) { family <- model_contrl[["family"]] if (is.null(family)) family <- "binomial" diff --git a/R/ModelH2OOutOfSamplePrediction.R b/R/ModelH2OOutOfSamplePrediction.R index 5520cf1..628781c 100644 --- a/R/ModelH2OOutOfSamplePrediction.R +++ b/R/ModelH2OOutOfSamplePrediction.R @@ -1,5 +1,5 @@ check_out_of_sample_consistency <- function(models_list, valid_H2Oframe, predvars, fold_column) { - all_folds_h2o <- lapply(models_list, h2o.cross_validation_fold_assignment) + all_folds_h2o <- lapply(models_list, function(x) h2o::h2o.cross_validation_fold_assignment(x)) train_frame_ID_1 <- models_list[[1]]@parameters$training_frame if (length(all_folds_h2o) > 1) { @@ -128,7 +128,7 @@ predict_out_of_sample_cv <- function(m.fit, ParentObject, validation_data, subse if (ncol(newpreds) > 1) newpreds <- newpreds[["p1"]] newpreds_prev_CV_i <- h2o::h2o.cbind(newpreds_prev_CV_i, newpreds) } - newpreds_prev_CV_i <- h2o::h2o.cbind(h2o.which(fold_CV_i_logical), newpreds_prev_CV_i) + newpreds_prev_CV_i <- h2o::h2o.cbind(h2o::h2o.which(fold_CV_i_logical), newpreds_prev_CV_i) pAoutMat_h2o <- h2o::h2o.rbind(pAoutMat_h2o, newpreds_prev_CV_i) } diff --git a/R/ModelPredictionStack.R b/R/ModelPredictionStack.R index 9d26135..c748140 100644 --- a/R/ModelPredictionStack.R +++ b/R/ModelPredictionStack.R @@ -1,13 +1,11 @@ #' S3 methods for printing model fit summary for PredictionModel R6 class object #' #' Prints the modeling summaries -#' @param modelstack The model fit object produced by functions \code{make_PredictionStack}. -#' @param model_stats Also print some model summaries? -#' @param all_fits Print all of the modeling fits contained in this object? Warning: this may produce a lot of output! +#' @param x The model fit object produced by functions \code{make_PredictionStack}. #' @param ... Additional options passed on to \code{print.PredictionModel}. #' @export -print.PredictionStack <- function(modelstack, model_stats = FALSE, all_fits = FALSE, ...) { - modelstack$show(model_stats = model_stats, all_fits = all_fits) +print.PredictionStack <- function(x, ...) { + x$show(...) 
return(invisible(NULL)) } diff --git a/R/ModelXGBoostClass.R b/R/ModelXGBoostClass.R index a3f0ec6..2b0b17f 100644 --- a/R/ModelXGBoostClass.R +++ b/R/ModelXGBoostClass.R @@ -349,7 +349,7 @@ predictP1.XGBoostmodel <- function(m.fit, ParentObject, DataStorageObject, subse #' } #' @section Methods: #' \describe{ -#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{\link{RegressionClass}} object to instantiate a new storage container for a +#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{RegressionClass} object to instantiate a new storage container for a #' design matrix and binary outcome.} #' \item{\code{setdata()}}{...} #' } diff --git a/R/ModelXGBoostGridLearner.R b/R/ModelXGBoostGridLearner.R index dc905c8..e0ad666 100644 --- a/R/ModelXGBoostGridLearner.R +++ b/R/ModelXGBoostGridLearner.R @@ -1,6 +1,12 @@ + + +if(getRversion() >= "2.15.1") { + utils::globalVariables(c("xgb_fit", "niter")) +} + ## custom MSE error evaluation function. Averages the subject level MSE first, then averages across subjects evalMSEerror_byID <- function(preds, data) { - labels <- getinfo(data, "label") + labels <- xgboost::getinfo(data, "label") # The usual RMSE (based on rows in the dataset) # err = sqrt(as.numeric(sum((labels - preds)^2))/length(labels)) # RMSE based on the average MSE across subjects first @@ -15,8 +21,47 @@ evalMSEerror_byID <- function(preds, data) { return(list(metric = "RMSE", value = err)) } -## Peforming simple hyperparameter grid search for xgboost with cross-validation -## Relies on tidyverse syntax and borrowed from: https://drsimonj.svbtle.com/grid-search-in-the-tidyverse +#' Hyper-parameter grid search for xgboost +#' +#' Performing simple hyper-parameter grid search for xgboost. Model scoring can be +#' done either with validation data or with V-fold cross-validation. +#' @param param_grid A named list with xgboost parameter names, consisting of vectors of hyper-parameter values. +#' The dataset containing the grid of possible hyper-parameters for model +#' training is formed internally by running \code{purrr::cross_d(param_grid)}. +#' @param data Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param nrounds Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param nfold Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param label Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param missing Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param prediction Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param showsd Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param metrics Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param obj Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param feval Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param stratified Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param folds Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param verbose Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param early_stopping_rounds Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param maximize Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param callbacks Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @param search_criteria Define how to search over the grid of hyper-parameters. 
+#' This should be the list with parameters controlling the grid search. +#' Currently supported parameters are: \code{'strategy'} and \code{'max_models'}. +#' Currently supported values for \code{strategy} are \code{'Cartesian'} (covers the entire space of hyper-parameter combinations) or +#' \code{'RandomDiscrete'} (do a random search of all the combinations of hyper-parameters). +#' \code{'max_models'} parameter can be set to an integer >0 that defines the maximum number of models to be trained. +#' @param seed Specify the seed to use for determining the random model order in random grid search. +#' @param order_metric_name What is the name of the metric for ranking the final grid of model fits? +#' @param validation_data Validation data to score the model performance while training with \code{xgboost::xgb.train}. +#' Must be in the same format as \code{data}, see \code{?xgboost::xgb.train} for additional information. +#' @param ... Other parameters passed on directly to either \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}. +#' @return A resulting grid search of model object fits in a form of a \code{data.table} with \code{xgboost} model fit +#' objects saved in a list column named \code{'xgb_fit'}. +#' In addition, the output \code{data.table} contains the original hyper-parameters used as well as the model +#' performance metrics assessed by \code{xgboost}. The dataset is sorted according to the \code{order_metric_name}. +#' @author The code for using \code{tidyverse} syntax for model grid search is borrowed and adapted from: +#' \href{https://drsimonj.svbtle.com/grid-search-in-the-tidyverse}{https://drsimonj.svbtle.com/grid-search-in-the-tidyverse}. +#' The \code{search_criteria} idea is borrowed from \code{h2o::h2o.grid}. #' @export # print_every_n = 1L, xgb.grid <- function(param_grid, data, nrounds, nfold, label = NULL, missing = NA, @@ -132,13 +177,12 @@ xgb.grid <- function(param_grid, data, nrounds, nfold, label = NULL, missing = N ## ------------------------------------------------------------ ## Sequentially fit each model from the grid ## ------------------------------------------------------------ + gs <- gs %>% dplyr::mutate(xgb_fit = purrr::pmap(gs, run_singe_model)) # data.table::setDT(gs) # for (i in 1:nrow(gs)) { # gs[i, xgb_fit := list(list(purrr::lift(run_singe_model)(gs[i, ])))] # } - gs <- gs %>% dplyr::mutate(xgb_fit = purrr::pmap(gs, run_singe_model)) - ## ------------------------------------------------------------ ## TO RUN GRID MODELS IN PARALLEL ## ------------------------------------------------------------ diff --git a/R/ModelXGBoostOutOfSamplePrediction.R b/R/ModelXGBoostOutOfSamplePrediction.R index 90bf970..9f0666d 100644 --- a/R/ModelXGBoostOutOfSamplePrediction.R +++ b/R/ModelXGBoostOutOfSamplePrediction.R @@ -1,35 +1,3 @@ -# check_out_of_sample_consistency <- function(models_list, valid_H2Oframe, predvars, fold_column) { -# all_folds_h2o <- lapply(models_list, h2o.cross_validation_fold_assignment) -# train_frame_ID_1 <- models_list[[1]]@parameters$training_frame - -# if (length(all_folds_h2o) > 1) { -# ## 1. Test that the exactly the same fold assignments were used by all CV models in the ensemble. -# for (idx in 2:length(all_folds_h2o) ) { -# if (!h2o::h2o.all(all_folds_h2o[[1]]==all_folds_h2o[[idx]]) ) -# stop("Out-of-sample (holdout) predictions for new data has failed. The fold assignmets of the following CV model do not match to others: " %+% names(models_list)[idx]) -# } - -# ## 2. 
Test that same training h2oFrame was used for all models in the ensemble (just in case). -# for (idx in 2:length(all_folds_h2o) ) { -# if (!all.equal(train_frame_ID_1, models_list[[idx]]@parameters$training_frame)) -# stop("Out-of-sample (holdout) predictions for new data has failed. It appears that some of the CV models in ensemble used different training frames.") -# } -# } - -# ## 3. Test that the validation and training data have exactly the same fold assignments (in h2oFrame) -# if (!all(valid_H2Oframe[[fold_column]] == all_folds_h2o[[1]])) -# stop("Out-of-sample (holdout) predictions for new data has failed. The fold assignments in new data (validation_data) and training data appear to be different.") - -# ## 4a. Test that the new validation data (in h2oFrame) has the same number of observations as the training data -# if (!(nrow(valid_H2Oframe) == nrow(h2o::h2o.getFrame(train_frame_ID_1)))) -# stop("Out-of-sample (holdout) predictions for new data has failed. The number of rows in new data (validation_data) does not match that of the training data.") - -# ## 4b. Test that all predictors are present in the validation data (in h2oFrame) -# if (!all(c(predvars,fold_column) %in% colnames(valid_H2Oframe))) -# stop("Out-of-sample (holdout) predictions for new data has failed. Some of the predictors were not found in new data (validation_data).") -# return(invisible(TRUE)) -# } - ## ---------------------------------------------------------------------------------------------------------------------------------- ## Evaluate out-of-sample predictions from V cross-validation models, based on new validation_data. ## Can be useful for re-scoring the models when the validation data has to change from the training data in V-fold cross-validation. @@ -37,7 +5,7 @@ ## (i.e., predictions in validation_data will be only made for rows that were not used for training the model V_i) ## In the end we generate a vector of n=nrow(validation_data) predictions by combining predictions from all models V=(V_1,...,V_v) ## This procedure is repeated for each cross-validated model in the ensemble, resulting in a matrix of predictions (n,k), -## where k is the total number of models trained by this ensemble (with h2o.grid, etc) and is equal to length(models_list) +## where k is the total number of models trained by this ensemble (with xgb.grid, etc) and is equal to length(models_list) ## ---------------------------------------------------------------------------------------------------------------------------------- xgb_predict_out_of_sample_cv <- function(m.fit, ParentObject, validation_data, subset_idx, predict_model_names, ...) { models_list <- m.fit$modelfits_all diff --git a/R/Modelbrokenstick.R b/R/Modelbrokenstick.R index 08e59bd..b921a43 100644 --- a/R/Modelbrokenstick.R +++ b/R/Modelbrokenstick.R @@ -1,44 +1,51 @@ + + +if(getRversion() >= "2.15.1") { + utils::globalVariables(c("yhat", "y", "x", "subject")) +} + + #' S3 methods for printing model fit summary for brokenstickmodel class object #' #' Prints the modeling summary for the glm fit (\code{stats::glm.fit} or \code{speedglm::speedglm.wfit}) -#' @param model.fit The model fit object produced by functions stremr:::fit.glm or stremr:::fit.speedglm +#' @param x The model fit object produced by functions stremr:::fit.glm or stremr:::fit.speedglm #' @param ... Additional options passed on to \code{summary.GLMmodel}. #' @return The output is printed with \code{cat}. To capture the markdown-formated model summary use \code{summary.GLMmodel}. 
#' @export -print.brokenstickmodel <- function(model.fit, ...) { - cat(paste(summary(model.fit, ...), collapse = '\n')) +print.brokenstickmodel <- function(x, ...) { + cat(paste(summary(x, ...), collapse = '\n')) } #' S3 methods for getting model fit summary for glmfit class object #' #' Prints the modeling summary for the GLM fit (\code{stats::glm.fit} or \code{speedglm::speedglm.wfit}) -#' @param model.fit The model fit object produced by functions stremr:::glmfit.glm or stremr:::glmfit.speedglm +#' @param object The model fit object produced by functions stremr:::glmfit.glm or stremr:::glmfit.speedglm #' @param format_table Format the coefficients into a data.frame table? #' @param ... Additional options (not used) #' @return The markdown-formated model summary returned by \code{pander::pander_return}. #' @export -summary.brokenstickmodel <- function(model.fit, format_table = TRUE, ...) { - makeModelCaption <- function(model.fit) { +summary.brokenstickmodel <- function(object, format_table = TRUE, ...) { + makeModelCaption <- function(object) { return( - "Model: " %+% model.fit$params$outvar %+% " ~ " %+% paste0(model.fit$params$predvars, collapse = " + ") %+% "; \\ - Stratify: " %+% model.fit$params$stratify %+% "; \\ - N: " %+% prettyNum(model.fit$nobs, big.mark = ",", scientific = FALSE) %+% "; \\ - Fit function: " %+% model.fit$fitfunname + "Model: " %+% object$params$outvar %+% " ~ " %+% paste0(object$params$predvars, collapse = " + ") %+% "; \\ + Stratify: " %+% object$params$stratify %+% "; \\ + N: " %+% prettyNum(object$nobs, big.mark = ",", scientific = FALSE) %+% "; \\ + Fit function: " %+% object$fitfunname ) } - nobs <- model.fit$nobs - coef_out <- model.fit$coef + nobs <- object$nobs + coef_out <- object$coef if (format_table) { if (is.null(coef_out)) { coef_out <- "---"; names(coef_out) <- coef_out } coef_out <- data.frame(Terms = names(coef_out), Coefficients = as.vector(coef_out)) - # coef_out <- data.frame(Terms = model.fit$params$predvars, Coefficients = as.vector(coef_out)) + # coef_out <- data.frame(Terms = object$params$predvars, Coefficients = as.vector(coef_out)) rownames(coef_out) <- NULL } - pander::set.caption(makeModelCaption(model.fit)) - # S4 class: model.fit$model.object - m.summary <- capture.output(print(model.fit$model.object)) + pander::set.caption(makeModelCaption(object)) + # S4 class: object$model.object + m.summary <- utils::capture.output(print(object$model.object)) out <- c(pander::pander_return(coef_out, justify = c('right', 'left')), m.summary) out } @@ -120,7 +127,7 @@ predictP1.brokenstickmodel <- function(m.fit, ParentObject, DataStorageObject, s assert_that(new_vals_idx[1] > length(fitted.Yvals)) setkeyv(new.dat, cols = "subject") - bs.predict <- getFromNamespace("predict.brokenstick", "brokenstick") + bs.predict <- utils::getFromNamespace("predict.brokenstick", "brokenstick") new.dat[, yhat := bs.predict(model.object, y = y, x = x, output = "vector"), by = subject] new.preds <- new.dat[new_vals_ind == TRUE, yhat] pAout[subset_idx] <- as.vector(new.preds) diff --git a/R/Modelface.R b/R/Modelface.R index 52711fb..302d1de 100644 --- a/R/Modelface.R +++ b/R/Modelface.R @@ -3,8 +3,6 @@ fit.face <- function(fit.class, params, subj, argvals, Yvals, knots = NULL, mode if (gvars$verbose) print("calling face::face.sparse...") if (length(subj) == 0L) { model.fit <- list() - model.fit$coef = rep.int(NA_real_, ncol(Xmat)) - names(model.fit$coef) <- colnames(Xmat) } else { facedat <- data.frame( argvals = argvals, @@ -67,7 +65,7 @@ predictP1.facemodel <- 
function(m.fit, ParentObject, DataStorageObject, subset_i assert_that(length(new_vals_idx) == length(new.Yvals)) assert_that(new_vals_idx[1] > length(fitted.Yvals)) - fpredict <- getFromNamespace("predict.face.sparse", "face") + fpredict <- utils::getFromNamespace("predict.face.sparse", "face") predict.res <- fpredict(model.object, new.face.dat) all.preds <- predict.res$y.pred assert_that(length(all.preds)==nrow(new.face.dat)) diff --git a/R/fit_main.R b/R/fit_main.R index 96b5e9a..7f59112 100644 --- a/R/fit_main.R +++ b/R/fit_main.R @@ -1,11 +1,11 @@ #' S3 methods for printing a collection of learners #' #' Prints the stack models -#' @param modelstack An object (list) of class ModelStack +#' @param x An object (list) of class ModelStack #' @param ... Additional options passed on to \code{print.PredictionModel}. #' @export -print.ModelStack <- function(modelstack, ...) { - str(modelstack) +print.ModelStack <- function(x, ...) { + str(x) return(invisible(NULL)) } @@ -102,7 +102,6 @@ fit <- function(...) { UseMethod("fit") } #' @param t_name A character string name of the column with integer-valued measurement time-points (in days, weeks, months, etc). #' @param x A vector containing the names of predictor variables to use for modeling. If x is missing, then all columns except \code{ID}, \code{y} are used. #' @param y A character string name of the column that represent the response variable in the model. -#' @param params Parameters specifying the type of modeling procedure to be used. #' @param nfolds Number of folds to use in cross-validation. #' @param fold_column The name of the column in the input data that contains the cross-validation fold indicators (must be an ordered factor). #' @param hold_column The name of the column that contains the holdout observation indicators (TRUE/FALSE) in the input data. diff --git a/R/summaryS3methods.R b/R/summaryS3methods.R index 92c030b..1a9b9de 100644 --- a/R/summaryS3methods.R +++ b/R/summaryS3methods.R @@ -8,6 +8,7 @@ NULL #' #' Prints a H2OBinomialMetrics object in Pandoc's markdown. #' @param H2OBinomialMetricsObject H2OBinomialMetrics object +#' @param type Character name specifying the type of metric (e.g., "training", "validation", "cross-validation") #' @return By default this function outputs (see: \code{?cat}) the result. #' If you would want to catch the result instead, then call \code{pander_return} instead. #' @export @@ -40,11 +41,6 @@ pander.H2OBinomialMetrics <- function(H2OBinomialMetricsObject, type) { # colnames(metricsDF) <- NULL pander::pander(metricsDF, justify = c('left', 'center'), caption = type %+% " data metrics" %+% "; Category: " %+% categor) - # cm <- try(h2o::h2o.confusionMatrix(H2OBinomialMetricsObject), silent = TRUE) - # if( !is.null(cm) ) { - # pander::pander(cm, caption = "Confusion Matrix for F1-optimal threshold" %+% " (Model ID: " %+% modelID %+%")" ) - # } - max_matrics <- h2o_metrics$max_criteria_and_metric_scores caption <- attributes(max_matrics)$header %+% ": " %+% attributes(max_matrics)$description pander::pander(max_matrics, caption = caption %+% " (Model ID: " %+% modelID %+%")") @@ -62,6 +58,7 @@ pander.H2OBinomialMetrics <- function(H2OBinomialMetricsObject, type) { #' #' Prints a H2ORegressionMetrics object in Pandoc's markdown. #' @param H2ORegressionMetricsObject H2ORegressionMetrics object +#' @param type Character name specifying the type of metric (e.g., "training", "validation", "cross-validation") #' @return By default this function outputs (see: \code{?cat}) the result. 
#' If you would want to catch the result instead, then call \code{pander_return} instead. #' @export @@ -86,43 +83,43 @@ pander.H2OGrid <- function(H2OGridObject) { #' S3 methods for printing model fit summary for glmfit class object #' #' Prints the modeling summary for the glm fit (\code{stats::glm.fit} or \code{speedglm::speedglm.wfit}) -#' @param model.fit The model fit object produced by functions stremr:::fit.glm or stremr:::fit.speedglm +#' @param x The model fit object produced by functions stremr:::fit.glm or stremr:::fit.speedglm #' @param ... Additional options passed on to \code{summary.GLMmodel}. #' @return The output is printed with \code{cat}. To capture the markdown-formated model summary use \code{summary.GLMmodel}. #' @export -print.GLMmodel <- function(model.fit, ...) { - model.summary <- summary(model.fit, ...) +print.GLMmodel <- function(x, ...) { + model.summary <- summary(x, ...) cat(paste(model.summary, collapse = '\n')) } #' S3 methods for fit summary for glmfit class #' #' Prints the modeling summary for the GLM fit (\code{stats::glm.fit} or \code{speedglm::speedglm.wfit}) -#' @param model.fit The model fit object produced by functions stremr:::glmfit.glm or stremr:::glmfit.speedglm +#' @param object The model fit object produced by functions stremr:::glmfit.glm or stremr:::glmfit.speedglm #' @param format_table Format the coefficients into a data.frame table? #' @param ... Additional options (not used) #' @return The markdown-formated model summary returned by \code{pander::pander_return}. #' @export -summary.GLMmodel <- function(model.fit, format_table = TRUE, ...) { - makeModelCaption <- function(model.fit) { +summary.GLMmodel <- function(object, format_table = TRUE, ...) { + makeModelCaption <- function(object) { return( - "Model: " %+% model.fit$params$outvar %+% " ~ " %+% paste0(model.fit$params$predvars, collapse = " + ") %+% "; \\ - Stratify: " %+% model.fit$params$stratify %+% "; \\ - N: " %+% prettyNum(model.fit$nobs, big.mark = ",", scientific = FALSE) %+% "; \\ - Fit function: " %+% model.fit$fitfunname + "Model: " %+% object$params$outvar %+% " ~ " %+% paste0(object$params$predvars, collapse = " + ") %+% "; \\ + Stratify: " %+% object$params$stratify %+% "; \\ + N: " %+% prettyNum(object$nobs, big.mark = ",", scientific = FALSE) %+% "; \\ + Fit function: " %+% object$fitfunname ) } - nobs <- model.fit$nobs - coef_out <- model.fit$coef + nobs <- object$nobs + coef_out <- object$coef if (format_table) { if (is.null(coef_out)) { coef_out <- "---"; names(coef_out) <- coef_out } coef_out <- data.table::data.table(Terms = names(coef_out), Coefficients = as.vector(coef_out)) - # coef_out <- data.frame(Terms = model.fit$params$predvars, Coefficients = as.vector(coef_out)) + # coef_out <- data.frame(Terms = object$params$predvars, Coefficients = as.vector(coef_out)) # rownames(coef_out) <- NULL } - pander::set.caption(makeModelCaption(model.fit)) + pander::set.caption(makeModelCaption(object)) out <- pander::pander_return(coef_out, justify = c('right', 'left')) out } @@ -130,45 +127,14 @@ summary.GLMmodel <- function(model.fit, format_table = TRUE, ...) { #' S3 methods for fit summary from xgboost #' #' Prints the modeling summary for the xgboost model fit (see \code{xgboost} R package). -#' @param xgb.model The model fit object produced by xgboost (and extracted with \code{getmodel_byname}). +#' @param object The model fit object produced by xgboost (and extracted with \code{getmodel_byname}). #' @param ... 
Additional options (not used) #' @return The markdown-formated model summary returned by \code{pander::pander_return}. #' @export -summary.xgb.Booster <- function(xgb.model, ...) { +summary.xgb.Booster <- function(object, ...) { out <- NULL - # out <- c(out, - # pander::pander_return(data.frame(function_call = as.character(xgb.model$call)[1], row.names = NULL)) - # ) - - # ----------------------------------------------------------------- - # some basic model info: - # ----------------------------------------------------------------- - # coef_summary_out <- summary.GLMmodel(model.fit, format_table) - # if (!is.null(xgb.model@model$coefficients_table)) { - # coef_summary_out <- pander::pander_return(xgb.model@model$coefficients_table, caption = attributes(xgb.model@model$coefficients_table)$description) - # out <- c(out, coef_summary_out) - # } - - # if (!only.coefs) { - # ----------------------------------------------------------------- - # model summary: - # ----------------------------------------------------------------- - # model_summary <- xgb.model@model$model_summary - # caption_summary <- attributes(model_summary)$header %+% " (Model ID: " %+% modelID %+%")" - # model_summary_out <- pander::pander_return(model_summary, caption = caption_summary) - # out <- c(out, model_summary_out) - - # out <- c(out, pander::pander_return(model_obj$call)) - - # ----------------------------------------------------------------- - # model parameters: - # ----------------------------------------------------------------- - # covars <- paste0(xgb.model@parameters$x, collapse = ",") - # predictors <- pander::pander_return(data.frame(predictors = covars)) - # out <- c(out, predictors) - - params <- xgb.model$params + params <- object$params params <- lapply(params, function(arg) if (length(arg) > 1) {paste0(arg, collapse = ",")} else {arg}) all_params <- t(data.table::as.data.table(params)) # all_params <- t(data.frame(params)) @@ -181,7 +147,7 @@ summary.xgb.Booster <- function(xgb.model, ...) { training_stats <- data.table::data.table( name = c("best_iteration", "best_ntreelimit", "niter"), - value = c(xgb.model$best_iteration, xgb.model$best_ntreelimit, xgb.model$niter) + value = c(object$best_iteration, object$best_ntreelimit, object$niter) ) training_stats_out <- pander::pander_return(training_stats, caption = "Model Training Stats") @@ -190,17 +156,17 @@ summary.xgb.Booster <- function(xgb.model, ...) { # ----------------------------------------------------------------- # model metrics (training and validation): # ----------------------------------------------------------------- - if (!is.null(xgb.model$best_score)) { - metric_name <- attr(xgb.model$best_score, "names") + if (!is.null(object$best_score)) { + metric_name <- attr(object$best_score, "names") performance <- data.table::data.table( metric_name = metric_name, - best_score = xgb.model$best_score) + best_score = object$best_score) performance_out <- pander::pander_return(performance, caption = "Model Performance") out <- c(out, performance_out) } - metrics <- xgb.model$evaluation_log + metrics <- object$evaluation_log if (nrow(metrics) > 10) metrics <- metrics[c(1:5, (nrow(metrics)-4):nrow(metrics)), ] model_metrics_out <- pander::pander_return(metrics, caption = "Model Performance By Iteration") out <- c(out, model_metrics_out) @@ -208,16 +174,16 @@ summary.xgb.Booster <- function(xgb.model, ...) 
{ # ----------------------------------------------------------------- # variable importance: # ----------------------------------------------------------------- - feature_names <- xgb.model[["params"]][["feature_names"]] + feature_names <- object[["params"]][["feature_names"]] caption <- "Feature Importance" - if (class(xgb.model) %in% "xgb.cv.synchronous") { - xgb.model <- xgb.model[["models"]][[1]] + if (class(object) %in% "xgb.cv.synchronous") { + object <- object[["models"]][[1]] caption <- caption %+% " for Training Fold 1" } try({ - importance_matrix <- xgboost::xgb.importance(feature_names = feature_names, model = xgb.model) + importance_matrix <- xgboost::xgb.importance(feature_names = feature_names, model = object) importance_out <- pander::pander_return(importance_matrix, caption = caption) out <- c(out, importance_out) }, silent = TRUE) @@ -230,29 +196,29 @@ summary.xgb.Booster <- function(xgb.model, ...) { #' @rdname summary.xgb.Booster #' @export -summary.xgb.cv.synchronous <- function(xgb.model, ...) summary.xgb.Booster(xgb.model, ...) +summary.xgb.cv.synchronous <- function(object, ...) summary.xgb.Booster(object, ...) #' S3 methods for fit summary for h2o #' #' Prints the modeling summary for the h2o model fit (see \code{h2o} R package). -#' @param h2o.model The model fit object produced by h2o (and extracted with \code{getmodel_byname}). +#' @param object The model fit object produced by h2o (and extracted with \code{getmodel_byname}). #' @param only.coefs Skip all of the h2o-specific model stats, only print the coefficients table (when running \code{fit.algorithm = "glm"}). # format_table Format the coefficients into a data.frame table (when running \code{fit.algorithm = "glm"})? #' @param ... Additional options (not used) #' @return The markdown-formated model summary returned by \code{pander::pander_return}. #' @export -summary.H2ORegressionModel <- function(h2o.model, only.coefs = FALSE, ...) { +summary.H2ORegressionModel <- function(object, only.coefs = FALSE, ...) { - # h2o.model <- model.fit$H2O.model.object - modelID <- h2o.model@model$training_metrics@metrics$model$name + # object <- object$object.object + modelID <- object@model$training_metrics@metrics$model$name out <- NULL # ----------------------------------------------------------------- # some basic model info: # ----------------------------------------------------------------- - # coef_summary_out <- summary.GLMmodel(model.fit, format_table) - if (!is.null(h2o.model@model$coefficients_table)) { - coef_summary_out <- pander::pander_return(h2o.model@model$coefficients_table, caption = attributes(h2o.model@model$coefficients_table)$description) + # coef_summary_out <- summary.GLMmodel(object, format_table) + if (!is.null(object@model$coefficients_table)) { + coef_summary_out <- pander::pander_return(object@model$coefficients_table, caption = attributes(object@model$coefficients_table)$description) out <- c(out, coef_summary_out) } @@ -260,7 +226,7 @@ summary.H2ORegressionModel <- function(h2o.model, only.coefs = FALSE, ...) 
{ # ----------------------------------------------------------------- # model summary: # ----------------------------------------------------------------- - model_summary <- h2o.model@model$model_summary + model_summary <- object@model$model_summary caption_summary <- attributes(model_summary)$header %+% " (Model ID: " %+% modelID %+%")" model_summary_out <- pander::pander_return(model_summary, caption = caption_summary) out <- c(out, model_summary_out) @@ -268,11 +234,11 @@ summary.H2ORegressionModel <- function(h2o.model, only.coefs = FALSE, ...) { # ----------------------------------------------------------------- # model parameters: # ----------------------------------------------------------------- - covars <- paste0(h2o.model@parameters$x, collapse = ",") + covars <- paste0(object@parameters$x, collapse = ",") predictors <- pander::pander_return(data.frame(predictors = covars)) out <- c(out, predictors) - params <- h2o.model@parameters[!names(h2o.model@parameters) %in% c("x", "model_id")] + params <- object@parameters[!names(object@parameters) %in% c("x", "model_id")] params <- lapply(params, function(arg) if (length(arg) > 1) {paste0(arg, collapse = ",")} else {arg}) all_params <- t(data.table::as.data.table(params)) @@ -288,13 +254,13 @@ summary.H2ORegressionModel <- function(h2o.model, only.coefs = FALSE, ...) { # ----------------------------------------------------------------- # training data metrics: # ----------------------------------------------------------------- - train_model_metrics_out <- pander::pander_return(h2o.model@model$training_metrics, type = "Training") + train_model_metrics_out <- pander::pander_return(object@model$training_metrics, type = "Training") out <- c(out, train_model_metrics_out) # ----------------------------------------------------------------- # validation data metrics: # ----------------------------------------------------------------- - H2OBinomialMetrics_val <- h2o.model@model$validation_metrics + H2OBinomialMetrics_val <- object@model$validation_metrics if (!is.null(H2OBinomialMetrics_val@metrics)) { valid_model_metrics_out <- pander::pander_return(H2OBinomialMetrics_val, type = "Validation") out <- c(out, valid_model_metrics_out) @@ -303,7 +269,7 @@ summary.H2ORegressionModel <- function(h2o.model, only.coefs = FALSE, ...) { # ----------------------------------------------------------------- # cross validation data metrics: # ----------------------------------------------------------------- - H2OBinomialMetrics_xval <- h2o.model@model$cross_validation_metrics + H2OBinomialMetrics_xval <- object@model$cross_validation_metrics if (!is.null(H2OBinomialMetrics_xval@metrics)) { xval_model_metrics_out <- pander::pander_return(H2OBinomialMetrics_xval, type = "Cross-validation") out <- c(out, xval_model_metrics_out) @@ -312,8 +278,8 @@ summary.H2ORegressionModel <- function(h2o.model, only.coefs = FALSE, ...) { # ----------------------------------------------------------------- # variable importance: # ----------------------------------------------------------------- - # h2o.varimp(h2o.model) - var_imp <- h2o.model@model$variable_importances + # h2o.varimp(object) + var_imp <- object@model$variable_importances var_imp_cap <- attributes(var_imp)$header %+% "Model ID: " %+% modelID %+%")" var_imp_out <- pander::pander_return(var_imp, caption = var_imp_cap) out <- c(out, var_imp_out) @@ -323,7 +289,7 @@ summary.H2ORegressionModel <- function(h2o.model, only.coefs = FALSE, ...) 
{ #' @rdname summary.H2ORegressionModel #' @export -summary.H2OBinomialModel <- function(h2o.model, only.coefs = FALSE, ...) summary.H2ORegressionModel(h2o.model, only.coefs, ...) +summary.H2OBinomialModel <- function(object, only.coefs = FALSE, ...) summary.H2ORegressionModel(object, only.coefs, ...) #' S3 methods for printing model fit summaries as pander tables @@ -361,7 +327,7 @@ print_tables.xgb.Booster <- function(model, only.coefs = FALSE, ...) { } #' @rdname print_tables #' @export -print_tables.xgb.cv.synchronous <- function(model, only.coefs = FALSE, ...) print_tables.xgb.Booster(model, only.coefs, ..) +print_tables.xgb.cv.synchronous <- function(model, only.coefs = FALSE, ...) print_tables.xgb.Booster(model, only.coefs, ...) #' @rdname print_tables @@ -385,10 +351,7 @@ print_tables.GLMmodel <- function(model, only.coefs = FALSE, ...) { } CVmetrics_H2Obasemodel <- function(basemodelfit) { - # out <- NULL - # model_params <- t(data.frame(basemodelfit@parameters)) - # pander::pander_return(model_params, caption = "Base model parameters") - # str(basemodelfit@model$cross_validation_metrics) + CV <- basemodelfit@model$cross_validation_metrics@metrics cap <- CV$description CV.metrics.tab <- data.frame( @@ -402,43 +365,12 @@ CVmetrics_H2Obasemodel <- function(basemodelfit) { CV$Gini, CV$mean_per_class_error, CV$model_category) - # out <- c(out, pander::pander_return(CV.metrics.tab, caption = cap)) - # $residual_deviance - # $null_deviance - # $AIC - # $null_degrees_of_freedom - # $residual_degrees_of_freedom - # CVsummarytab <- basemodelfit@model$cross_validation_metrics_summary - # CVsummarytab <- CVsummarytab[, c("mean", "sd")] - # caption <- attributes(basemodelfit@model$cross_validation_metrics_summary)$header %+% ": " %+% basemodelfit@model_id - # out <- c(out, pander::pander_return(CVsummarytab, caption = caption)) + return(CV.metrics.tab) } CVsummary_H2Obasemodel <- function(basemodelfit) { out <- NULL - # model_params <- t(data.frame(basemodelfit@parameters)) - # pander::pander_return(model_params, caption = "Base model parameters") - # str(basemodelfit@model$cross_validation_metrics) - # CV <- basemodelfit@model$cross_validation_metrics@metrics - # cap <- CV$description - # CV.metrics.tab <- t(data.frame( - # model = basemodelfit@model_id, - # CV$nobs, - # CV$MSE, - # CV$RMSE, - # CV$logloss, - # CV$r2, - # CV$AUC, - # CV$Gini, - # CV$mean_per_class_error, - # CV$model_category)) - # out <- c(out, pander::pander_return(CV.metrics.tab, caption = cap)) - # $residual_deviance - # $null_deviance - # $AIC - # $null_degrees_of_freedom - # $residual_degrees_of_freedom CVsummarytab <- basemodelfit@model$cross_validation_metrics_summary CVsummarytab <- CVsummarytab[, c("mean", "sd")] caption <- attributes(basemodelfit@model$cross_validation_metrics_summary)$header %+% ": " %+% basemodelfit@model_id @@ -449,40 +381,22 @@ CVsummary_H2Obasemodel <- function(basemodelfit) { #' S3 methods for getting model fit summary for H2Oensemblemodel class object #' #' Prints the modeling summary for the h2o model fit (see \code{h2o} R package). -#' @param h2o.ensemble The model fit object produced by h2oEnsemble package +#' @param object The model fit object produced by h2oEnsemble package #' @param only.coefs Skip all of the h2o-specific model stats, only print the coefficients table (when running \code{fit.algorithm = "glm"}). #' @param format_table Format the coefficients into a data.frame table (when running \code{fit.algorithm = "glm"})? #' @param ... 
Additional options (not used) #' @return The markdown-formated model summary returned by \code{pander::pander_return}. #' @export -summary.H2Oensemblemodel <- function(h2o.ensemble, only.coefs = FALSE, format_table = TRUE, ...) { - # h2o.ensemble <- model.fit$H2O.model.object +summary.H2Oensemblemodel <- function(object, only.coefs = FALSE, format_table = TRUE, ...) { + # object <- object$H2O.model.object out <- NULL - x <- h2o.ensemble$x - y <- h2o.ensemble$y - family <- h2o.ensemble$family - learner <- h2o.ensemble$learner - metalearner <- h2o.ensemble$metalearner - Vfolds <- h2o.ensemble$cvControl$V - seed <- h2o.ensemble$seed - - # modelID <- h2o.ensemble@model$training_metrics@metrics$model$name - # str(h2o.ensemble) - # h2o.ensemble$basefits - # length(h2o.ensemble$basefits) - # h2o.ensemble$basefits[[1]] - # str(h2o.ensemble$basefits[[1]]) - - # metafit <- h2o.ensemble$metafit - # str(h2o.ensemble$metafit) - # h2o.ensemble$metafit@model$coefficients - # h2o.ensemble$metafit@model$model_summary - # metafit <- list() - # class(metafit) <- c(metafit, "H2Omodel") - # metafit$H2O.model.object <- h2o.ensemble$metafit - # metafit <- h2o.ensemble$metafit - # print("SuperLearner fit:"); print(metafit) - # str(h2o.ensemble) + x <- object$x + y <- object$y + family <- object$family + learner <- object$learner + metalearner <- object$metalearner + Vfolds <- object$cvControl$V + seed <- object$seed # ----------------------------------------------------------------- # some basic model info: @@ -492,9 +406,9 @@ summary.H2Oensemblemodel <- function(h2o.ensemble, only.coefs = FALSE, format_ta CV.descr.tab <- pander::pander_return(CV.descr, caption = "SuperLearner settings") out <- c(out, CV.descr.tab) - coef_summary_out <- summary.GLMmodel(model.fit, format_table) - if (!is.null(h2o.ensemble@model$coefficients_table)) { - coef_summary_out <- pander_return(h2o.ensemble@model$coefficients_table, caption = attributes(h2o.ensemble@model$coefficients_table)$description) + coef_summary_out <- summary.GLMmodel(object, format_table) + if (!is.null(object@model$coefficients_table)) { + coef_summary_out <- pander::pander_return(object@model$coefficients_table, caption = attributes(object@model$coefficients_table)$description) out <- c(out, coef_summary_out) } @@ -504,8 +418,8 @@ summary.H2Oensemblemodel <- function(h2o.ensemble, only.coefs = FALSE, format_ta # ----------------------------------------------------------------- # model summary: # ----------------------------------------------------------------- - # str(h2o.ensemble$metafit@model$model_summary) - model_summary <- h2o.ensemble$metafit@model$model_summary + # str(object$metafit@model$model_summary) + model_summary <- object$metafit@model$model_summary caption_summary <- attributes(model_summary)$header %+% " Wrapper: " %+% metalearner model_summary_out <- pander::pander_return(model_summary, caption = caption_summary) out <- c(out, model_summary_out) @@ -514,27 +428,13 @@ summary.H2Oensemblemodel <- function(h2o.ensemble, only.coefs = FALSE, format_ta # CV metrics for each base learner # ----------------------------------------------------------------- CVmetrics_tab <- NULL - for (basemodel in h2o.ensemble$basefits) { + for (basemodel in object$basefits) { CVmetrics_tab <- rbind(CVmetrics_tab, CVmetrics_H2Obasemodel(basemodel)) # out <- c(out, CVsummary_H2Obasemodel(basemodel)) } cap <- basemodel@model$cross_validation_metrics@metrics$description out <- c(out, pander::pander_return(CVmetrics_tab, caption = cap)) - # 
----------------------------------------------------------------- - # training data metrics: - # ----------------------------------------------------------------- - # H2OBinomialMetrics_training <- h2o.ensemble@model$training_metrics - # train_model_metrics_out <- pander::pander_return(H2OBinomialMetrics_training) - # out <- c(out, train_model_metrics_out) - - # ----------------------------------------------------------------- - # variable importance: - # ----------------------------------------------------------------- - # var_imp <- h2o.ensemble@model$variable_importances - # var_imp_cap <- attributes(var_imp)$header %+% "Model ID: " %+% modelID %+%")" - # var_imp_out <- pander::pander_return(var_imp, caption = var_imp_cap) - # out <- c(out, var_imp_out) } return(out) } @@ -542,12 +442,11 @@ summary.H2Oensemblemodel <- function(h2o.ensemble, only.coefs = FALSE, format_ta #' S3 methods for printing model fit summary for H2Omodel class object #' #' Prints the modeling summary for the h2o model fit (see \code{h2o} R package). -#' @param model.fit The model fit object produced by any stremr S3 function starting with \code{stremr:::H2Omodel.} -#' @param only.coefs Skip all of the h2o-specific model stats, only print the coefficients table (when running \code{fit.algorithm = "GLM"}). +#' @param x The model fit object produced by any stremr S3 function starting with \code{stremr:::H2Omodel.} #' @param ... Additional options passed on to \code{summary.H2Omodel}. #' @return The output is printed with \code{cat}. To capture the markdown-formated model summary use \code{summary.H2Omodel}. #' @export -print.H2Oensemblemodel <- function(model.fit, only.coefs = FALSE, ...) { - model.summary <- summary(model.fit, only.coefs, ...) +print.H2Oensemblemodel <- function(x, ...) { + model.summary <- summary(x, ...) cat(paste(model.summary, collapse = '\n')) } diff --git a/R/utility_funs.R b/R/utility_funs.R index 2ed0e6c..d951808 100644 --- a/R/utility_funs.R +++ b/R/utility_funs.R @@ -1,5 +1,7 @@ #' @useDynLib gridisl #' @import R6 +#' @import ggplot2 +#' @import ggiraph #' @importFrom magrittr %>% #' @importFrom Rcpp sourceCpp #' @importFrom graphics axis barplot hist par text legend plot @@ -34,6 +36,7 @@ capture.exprs <- function(...) { #' Save the best performing h2o model #' #' @param modelfit A model object of class \code{PredictionModel} returned by functions \code{fit_model} or \code{fit}. +#' @param file.path Specify the directory where the model object file should be saved. 
#' @export save_best_model <- function(modelfit, file.path = getOption('gridisl.file.path')) { stop("...not implemented...") @@ -44,11 +47,11 @@ save_best_model <- function(modelfit, file.path = getOption('gridisl.file.path') ## If CV SL was used this model is equivalent to the best model trained on all data ## However, for holdout SL this model will be trained only on non-holdout observations best_model_traindat <- modelfit$get_best_models(K = 1)[[1]] - h2o.saveModel(best_model_traindat, file.path, force = TRUE) + h2o::h2o.saveModel(best_model_traindat, file.path, force = TRUE) ## This model is always trained on all data (if exists) best_model_alldat <- modelfit$BestModelFitObject$model.fit$modelfits_all if (!is.null(best_model_alldat)) - h2o.saveModel(best_model_alldat[[1]], file.path, force = TRUE) + h2o::h2o.saveModel(best_model_alldat[[1]], file.path, force = TRUE) return(invisible(NULL)) } @@ -146,7 +149,7 @@ add_holdout_ind = function(data, ID, hold_column = "hold", random = TRUE, seed = importData <- function(data, ID = "Subject_ID", t_name = "time_period", covars, OUTCOME = "Y", verbose = getOption("gridisl.verbose")) { gvars$verbose <- verbose # if (verbose) { - # current.options <- capture.output(str(gvars$opts)) + # current.options <- utils::capture.output(str(gvars$opts)) # print("Using the following gridisl options/settings: ") # cat('\n') # cat(paste0(current.options, collapse = '\n'), '\n') @@ -264,17 +267,30 @@ CheckVarNameExists <- function(data, varname) { return(invisible(NULL)) } + +if(getRversion() >= "2.15.1") { + utils::globalVariables(c("model", + "MSE", + "CIlow", + "CIhi", + "algorithm", + "tooltip", + "model.id", + "onclick")) +} + # --------------------------------------------------------------------------------------- -#' Plot the top K smallest MSEs for a given model ensemble object. +#' Plot the top (smallest) validation MSEs for an ensemble of prediction models #' #' @param PredictionModel Must be an R6 object of class \code{PredictionModel} (returned by \code{get_fit} function) #' or an object of class \code{PredictionStack} (returned by \code{make_PredictionStack} function). #' Must also contain validation /test set predictions and corresponding MSEs. #' @param K How many top (smallest) MSEs should be plotted? Default is 5. +#' @param interactive Setting this to \code{TRUE} will produce an interactive plot in html format using the package \code{ggiraph}. 
#' @export plotMSEs <- function(PredictionModel, K = 1, interactive = FALSE) { - # require("ggplot2") - require("ggiraph") + require("ggplot2") + # require("ggiraph") assert_that(is.PredictionModel(PredictionModel) || is.PredictionStack(PredictionModel)) assert_that(is.integerish(K)) @@ -296,15 +312,15 @@ plotMSEs <- function(PredictionModel, K = 1, interactive = FALSE) { p <- p + geom_point_interactive(aes(color = algorithm, tooltip = tooltip, data_id = model.id, onclick = onclick), size = 2, position = position_dodge(0.01)) # alpha = 0.8 # p <- p + geom_point_interactive(aes(color = algorithm, tooltip = model.id, data_id = model.id, onclick = onclick), size = 2, position = position_dodge(0.01)) # alpha = 0.8 } else { - p <- p + geom_point(aes(color = algorithm), size = 2, position = position_dodge(0.01)) # alpha = 0.8 + p <- p + geom_point(aes(color = algorithm), size = 2, position = ggplot2::position_dodge(0.01)) # alpha = 0.8 } - p <- p + geom_errorbar(aes(color = algorithm), width = 0.2, position = position_dodge(0.01)) - p <- p + theme_bw() + coord_flip() + p <- p + ggplot2::geom_errorbar(aes(color = algorithm), width = 0.2, position = ggplot2::position_dodge(0.01)) + p <- p + ggplot2::theme_bw() + ggplot2::coord_flip() if (interactive){ - ggiraph(code = print(p), width = .6, - tooltip_extra_css = "padding:2px;background:rgba(70,70,70,0.1);color:black;border-radius:2px 2px 2px 2px;", - hover_css = "fill:#1279BF;stroke:#1279BF;cursor:pointer;" + ggiraph::ggiraph(code = print(p), width = .6, + tooltip_extra_css = "padding:2px;background:rgba(70,70,70,0.1);color:black;border-radius:2px 2px 2px 2px;", + hover_css = "fill:#1279BF;stroke:#1279BF;cursor:pointer;" ) # to active zoom on a plot: # zoom_max = 2 diff --git a/R/wrappers.R b/R/wrappers.R deleted file mode 100644 index a7bdcfb..0000000 --- a/R/wrappers.R +++ /dev/null @@ -1,393 +0,0 @@ -# Set of default wrappers to create a uniform interface for h2o supervised ML functions (H2O 3.0 and above) -# These wrapper functions should always be compatible with the master branch of: https://github.com/h2oai/h2o-3 -# See the ensemble README for a full wrapper compatibility chart - -# Example of a wrapper function: -#' @export -h2o.example.wrapper <- function(x, y, training_frame, model_id = NULL, family = c("gaussian", "binomial"), ...) { - # This function is just an example. 
- # You can wrap any H2O learner inside a wrapper function, example: h2o.glm - h2o.glm(x = x, y = y, training_frame = training_frame, family = family) -} - -# ------------------------------------------------------------------------------------- -# Wrappers for: h2o.glm, h2o.randomForest, h2o.gbm, h2o.deeplearning: -# ------------------------------------------------------------------------------------- - -# This is a version of the h2o.glm.wrapper which doesn't pass along all the args -# Use this version until this is resolved: https://0xdata.atlassian.net/browse/PUBDEV-1558 -# beta_constraints currently causing a bug: https://0xdata.atlassian.net/browse/PUBDEV-1556 -#' @export -h2o.glm.wrapper <- function(x, y, training_frame, model_id = NULL, - validation_frame = NULL, - ignore_const_cols = TRUE, - max_iterations = 50, - beta_epsilon = 0, - solver = c("IRLSM", "L_BFGS"), - standardize = TRUE, - family = c("gaussian", "binomial", "poisson", "gamma", "tweedie"), - link = c("family_default", "identity", "logit", "log", "inverse", "tweedie"), - tweedie_variance_power = 0, - tweedie_link_power = -1, - alpha = 0.5, - prior = NULL, - lambda = 1e-05, - lambda_search = FALSE, - nlambdas = NULL, - lambda_min_ratio = NULL, - nfolds = 0, - fold_column = NULL, - fold_assignment = c("AUTO", "Random", "Modulo"), - keep_cross_validation_predictions = TRUE, - keep_cross_validation_fold_assignment = TRUE, - beta_constraints = NULL, - offset_column = NULL, - weights_column = NULL, - intercept = TRUE, - max_active_predictors = NULL, - objective_epsilon = NULL, - gradient_epsilon = NULL, - non_negative = FALSE, - compute_p_values = FALSE, - remove_collinear_columns = FALSE, - max_runtime_secs = 0, - missing_values_handling = c("MeanImputation", "Skip"), ...) { - - # Also, offset_column, weights_column, intercept not implemented at the moment due to similar bug as beta_constraints - h2o.glm(x = x, y = y, training_frame = training_frame, model_id = model_id, - validation_frame = validation_frame, - ignore_const_cols = ignore_const_cols, - max_iterations = max_iterations, - beta_epsilon = beta_epsilon, - solver = match.arg(solver), - standardize = standardize, - family = match.arg(family), - link = match.arg(link), - tweedie_variance_power = tweedie_variance_power, - tweedie_link_power = tweedie_link_power, - alpha = alpha, - prior = prior, - lambda = lambda, - lambda_search = lambda_search, - nlambdas = nlambdas, - lambda_min_ratio = lambda_min_ratio, - nfolds = nfolds, - fold_column = fold_column, - fold_assignment = match.arg(fold_assignment), - keep_cross_validation_predictions = keep_cross_validation_predictions, - keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment, - #beta_constraints = beta_constraints, - #offset_column = offset_column, - #weights_column = weights_column, - #intercept = intercept, - max_active_predictors = max_active_predictors, - objective_epsilon = objective_epsilon, - gradient_epsilon = gradient_epsilon, - non_negative = non_negative, - compute_p_values = compute_p_values, - remove_collinear_columns = remove_collinear_columns, - max_runtime_secs = max_runtime_secs, - missing_values_handling = match.arg(missing_values_handling)) -} - - -#' @export -h2o.gbm.wrapper <- function(x, y, training_frame, model_id = NULL, # TO DO: add checkpoint? 
- ignore_const_cols = TRUE, - family = c("AUTO", "gaussian", "bernoulli", "binomial", "multinomial", "poisson", "gamma", "tweedie"), - quantile_alpha = 0.5, - tweedie_power = 1.5, - ntrees = 50, - max_depth = 5, - min_rows = 10, - learn_rate = 0.1, - sample_rate = 1, - col_sample_rate = 1, - col_sample_rate_per_tree = 1, - nbins = 20, - nbins_top_level, # TO DO: Add - nbins_cats = 1024, - validation_frame = NULL, - balance_classes = FALSE, - max_after_balance_size = 1, - seed, - build_tree_one_node = FALSE, - nfolds = 0, - fold_column = NULL, - fold_assignment = c("AUTO", "Random", "Modulo"), - keep_cross_validation_predictions = TRUE, - keep_cross_validation_fold_assignment = TRUE, - score_each_iteration = FALSE, - score_tree_interval = 0, - stopping_rounds = 0, - stopping_metric = c("AUTO", "deviance", "logloss", "MSE", "AUC", "misclassification", "mean_per_class_error"), - stopping_tolerance = 0.001, - max_runtime_secs = 0, - offset_column = NULL, - weights_column = NULL, ...) { - - family <- match.arg(family) - if (family == "binomial") { - distribution <- "bernoulli" - } else if (family %in% c("AUTO","gaussian", "bernoulli","multinomial","poisson","gamma","tweedie")) { - distribution <- family - } else { - stop("`family` not supported in `h2o.gbm.wrapper`") - } - h2o.gbm(x = x, y = y, training_frame = training_frame, model_id = model_id, - ignore_const_cols = ignore_const_cols, - distribution = distribution, - quantile_alpha = quantile_alpha, - tweedie_power = tweedie_power, - ntrees = ntrees, - max_depth = max_depth, - min_rows = min_rows, - learn_rate = learn_rate, - sample_rate = sample_rate, - col_sample_rate = col_sample_rate, - col_sample_rate_per_tree = col_sample_rate_per_tree, - nbins = nbins, - nbins_top_level = nbins_top_level, - nbins_cats = nbins_cats, - validation_frame = validation_frame, - balance_classes = balance_classes, - max_after_balance_size = max_after_balance_size, - seed = seed, - build_tree_one_node = build_tree_one_node, - nfolds = nfolds, - fold_column = fold_column, - fold_assignment = match.arg(fold_assignment), - keep_cross_validation_predictions = keep_cross_validation_predictions, - keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment, - score_each_iteration = score_each_iteration, - score_tree_interval = score_tree_interval, - stopping_rounds = stopping_rounds, - stopping_metric = match.arg(stopping_metric), - stopping_tolerance = stopping_tolerance, - max_runtime_secs = max_runtime_secs, - offset_column = offset_column, - weights_column = weights_column) -} - -#' @export -h2o.randomForest.wrapper <- function(x, y, training_frame, model_id = NULL, - validation_frame = NULL, - ignore_const_cols = TRUE, # TO DO: checkpoint? 
- family = c("binomial", "multinomial", "gaussian"), - mtries = -1, - sample_rate = 0.632, - col_sample_rate_per_tree = 1, - build_tree_one_node = FALSE, - ntrees = 50, - max_depth = 20, - min_rows = 1, - nbins = 20, - nbins_top_level, # TO DO: Add default - nbins_cats = 1024, - binomial_double_trees = FALSE, - balance_classes = FALSE, - max_after_balance_size = 5, - seed, - offset_column = NULL, - weights_column = NULL, - nfolds = 0, - fold_column = NULL, - fold_assignment = c("AUTO", "Random", "Modulo"), - keep_cross_validation_predictions = TRUE, - keep_cross_validation_fold_assignment = TRUE, - score_each_iteration = FALSE, - score_tree_interval = 0, - stopping_rounds = 0, - stopping_metric = c("AUTO", "deviance", "logloss", "MSE", "AUC", "misclassification", "mean_per_class_error"), - stopping_tolerance = 0.001, - max_runtime_secs = 0, ...) { - - # Currently ignoring the `family` arg, will get class from outcome in H2OFrame - # TO DO: Add a check to make sure that outcome/family type is consistent with specified family - h2o.randomForest(x = x, y = y, training_frame = training_frame, model_id = model_id, - validation_frame = validation_frame, - ignore_const_cols = ignore_const_cols, - mtries = mtries, - sample_rate = sample_rate, - col_sample_rate_per_tree = col_sample_rate_per_tree, - build_tree_one_node = build_tree_one_node, - ntrees = ntrees, - max_depth = max_depth, - min_rows = min_rows, - nbins = nbins, - nbins_top_level = nbins_top_level, - nbins_cats = nbins_cats, - binomial_double_trees = binomial_double_trees, - balance_classes = balance_classes, - max_after_balance_size = max_after_balance_size, - seed = seed, - offset_column = offset_column, - weights_column = weights_column, - nfolds = nfolds, - fold_column = fold_column, - fold_assignment = match.arg(fold_assignment), - keep_cross_validation_predictions = keep_cross_validation_predictions, - keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment, - score_each_iteration = score_each_iteration, - score_tree_interval = score_tree_interval, - stopping_rounds = stopping_rounds, - stopping_metric = match.arg(stopping_metric), - stopping_tolerance = stopping_tolerance, - max_runtime_secs = max_runtime_secs) -} - -#' @export -h2o.deeplearning.wrapper <- function(x, y, training_frame, model_id = NULL, - family = c("binomial", "multinomial", "gaussian"), - overwrite_with_best_model, - validation_frame = NULL, - checkpoint, - autoencoder = FALSE, - use_all_factor_levels = TRUE, - standardize = TRUE, - activation = c("Rectifier", "Tanh", "TanhWithDropout", - "RectifierWithDropout", "Maxout", "MaxoutWithDropout"), - hidden = c(200, 200), - epochs = 10, - train_samples_per_iteration = -2, - target_ratio_comm_to_comp = 0.05, #not on stable yet - seed, - adaptive_rate = TRUE, - rho = 0.99, - epsilon = 1e-08, - rate = 0.005, - rate_annealing = 1e-06, - rate_decay = 1, - momentum_start = 0, - momentum_ramp = 1e+06, - momentum_stable = 0, - nesterov_accelerated_gradient = TRUE, - input_dropout_ratio = 0, - hidden_dropout_ratios, - l1 = 0, - l2 = 0, - max_w2 = Inf, - initial_weight_distribution = c("UniformAdaptive", "Uniform", "Normal"), - initial_weight_scale = 1, - loss = c("Automatic", "CrossEntropy", "Quadratic", "Absolute", "Huber"), - distribution = c("AUTO", "gaussian", "bernoulli", "multinomial", - "poisson", "gamma", "tweedie", "laplace", "huber"), - quantile_alpha = 0.5, - tweedie_power = 1.5, - score_interval = 5, - score_training_samples, - score_validation_samples, - score_duty_cycle, - 
classification_stop, - regression_stop, - stopping_rounds = 5, - stopping_metric = c("AUTO", "deviance", "logloss", "MSE", "AUC", "misclassification", "mean_per_class_error"), - stopping_tolerance = 0, - max_runtime_secs = 0, - quiet_mode, - max_confusion_matrix_size, - max_hit_ratio_k, - balance_classes = FALSE, - class_sampling_factors, - max_after_balance_size, - score_validation_sampling, - diagnostics, - variable_importances, - fast_mode, - ignore_const_cols, - force_load_balance, - replicate_training_data, - single_node_mode, - shuffle_training_data, - sparse, - col_major, - average_activation, - sparsity_beta, - max_categorical_features, - reproducible = FALSE, - export_weights_and_biases = FALSE, - offset_column = NULL, - weights_column = NULL, - nfolds = 0, - fold_column = NULL, - fold_assignment = c("AUTO", "Random", "Modulo"), - keep_cross_validation_predictions = TRUE, - keep_cross_validation_fold_assignment = TRUE, - missing_values_handling = c("MeanImputation", "Skip"), ...) { - - # Currently ignoring the `family` arg, will get class from outcome in H2OFrame - h2o.deeplearning(x = x, y = y, training_frame = training_frame, model_id = model_id, - overwrite_with_best_model = overwrite_with_best_model, - validation_frame = validation_frame, - checkpoint = checkpoint, - autoencoder = autoencoder, - use_all_factor_levels = use_all_factor_levels, - standardize = standardize, - activation = match.arg(activation), - hidden = hidden, - epochs = epochs, - train_samples_per_iteration = train_samples_per_iteration, - target_ratio_comm_to_comp = target_ratio_comm_to_comp, - seed = seed, - adaptive_rate = adaptive_rate, - rho = rho, - epsilon = epsilon, - rate = rate, - rate_annealing = rate_annealing, - rate_decay = rate_decay, - momentum_start = momentum_start, - momentum_ramp = momentum_ramp, - momentum_stable = momentum_stable, - nesterov_accelerated_gradient = nesterov_accelerated_gradient, - input_dropout_ratio = input_dropout_ratio, - hidden_dropout_ratios = hidden_dropout_ratios, - l1 = l1, - l2 = l2, - max_w2 = max_w2, - initial_weight_distribution = match.arg(initial_weight_distribution), - initial_weight_scale = initial_weight_scale, - loss = match.arg(loss), - distribution = match.arg(distribution), - quantile_alpha = quantile_alpha, - tweedie_power = tweedie_power, - score_interval = score_interval, - score_training_samples = score_training_samples, - score_validation_samples = score_validation_samples, - score_duty_cycle = score_duty_cycle, - classification_stop = classification_stop, - regression_stop = regression_stop, - stopping_rounds = stopping_rounds, - stopping_metric = match.arg(stopping_metric), - stopping_tolerance = stopping_tolerance, - max_runtime_secs = max_runtime_secs, - quiet_mode = quiet_mode, - max_confusion_matrix_size = max_confusion_matrix_size, - max_hit_ratio_k = max_hit_ratio_k, - balance_classes = balance_classes, - class_sampling_factors = class_sampling_factors, - max_after_balance_size = max_after_balance_size, - score_validation_sampling = score_validation_sampling, - diagnostics = diagnostics, - variable_importances = variable_importances, - fast_mode = fast_mode, - ignore_const_cols = ignore_const_cols, - force_load_balance = force_load_balance, - replicate_training_data = replicate_training_data, - single_node_mode = single_node_mode, - shuffle_training_data = shuffle_training_data, - sparse = sparse, - col_major = col_major, - average_activation = average_activation, - sparsity_beta = sparsity_beta, - max_categorical_features = 
max_categorical_features, - reproducible = reproducible, - export_weights_and_biases = export_weights_and_biases, - offset_column = offset_column, - weights_column = weights_column, - nfolds = nfolds, - fold_column = fold_column, - fold_assignment = match.arg(fold_assignment), - keep_cross_validation_predictions = keep_cross_validation_predictions, - keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment, - missing_values_handling = match.arg(missing_values_handling)) -} \ No newline at end of file diff --git a/inst/report/report-script-rmd.R b/inst/report/report-script-rmd.R index 2ef5348..8a4dc24 100644 --- a/inst/report/report-script-rmd.R +++ b/inst/report/report-script-rmd.R @@ -126,7 +126,7 @@ # for (single_model in models$modelfits_all) { # # print(models, only.coefs = only.coefs) # print(single_model) - # # res <- capture.output(single_models) + # # res <- utils::capture.output(single_models) # # pander(print(paste(res, collapse = '\n'))) # } } diff --git a/man/XGBoostClass.Rd b/man/XGBoostClass.Rd index 505c9b3..4b1d116 100644 --- a/man/XGBoostClass.Rd +++ b/man/XGBoostClass.Rd @@ -27,7 +27,7 @@ This R6 class can request, store and manage the design matrix Xmat, as well as t \section{Methods}{ \describe{ - \item{\code{new(reg)}}{Uses \code{reg} R6 \code{\link{RegressionClass}} object to instantiate a new storage container for a + \item{\code{new(reg)}}{Uses \code{reg} R6 \code{RegressionClass} object to instantiate a new storage container for a design matrix and binary outcome.} \item{\code{setdata()}}{...} } diff --git a/man/fit.ModelStack.Rd b/man/fit.ModelStack.Rd index e5a4e3d..2ff971e 100644 --- a/man/fit.ModelStack.Rd +++ b/man/fit.ModelStack.Rd @@ -49,8 +49,6 @@ If FALSE then the last observation for each subject is selected as a holdout.} When \code{FALSE}, it might be impossible to make predictions from this model fit.} \item{verbose}{Set to \code{TRUE} to print messages on status and information to the console. Turn this on by default using \code{options(gridisl.verbose=TRUE)}.} - -\item{params}{Parameters specifying the type of modeling procedure to be used.} } \value{ An R6 object containing the model fit(s). diff --git a/man/glmModelClass.Rd b/man/glmModelClass.Rd index cc30054..ac2f74d 100644 --- a/man/glmModelClass.Rd +++ b/man/glmModelClass.Rd @@ -27,7 +27,7 @@ This R6 class can request, store and manage the design matrix Xmat, as well as t \section{Methods}{ \describe{ - \item{\code{new(reg)}}{Uses \code{reg} R6 \code{\link{RegressionClass}} object to instantiate a new storage container for a + \item{\code{new(reg)}}{Uses \code{reg} R6 \code{RegressionClass} object to instantiate a new storage container for a design matrix and binary outcome.} \item{\code{setdata()}}{...} } diff --git a/man/pander.H2OBinomialMetrics.Rd b/man/pander.H2OBinomialMetrics.Rd index ac60274..094d800 100644 --- a/man/pander.H2OBinomialMetrics.Rd +++ b/man/pander.H2OBinomialMetrics.Rd @@ -8,6 +8,8 @@ } \arguments{ \item{H2OBinomialMetricsObject}{H2OBinomialMetrics object} + +\item{type}{Character name specifying the type of metric (e.g., "training", "validation", "cross-validation")} } \value{ By default this function outputs (see: \code{?cat}) the result. 
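A quick sketch of how the new 'type' argument documented above might be used. The H2O fit, the frame name train_h2o, and the assumption that the method is registered for the pander::pander() generic are illustrative only and not part of this patch:

    library(h2o)
    library(pander)
    h2o.init(nthreads = -1)

    ## hypothetical binomial fit on an H2OFrame named `train_h2o`
    fit <- h2o.glm(x = c("x1", "x2"), y = "y", training_frame = train_h2o,
                   family = "binomial", nfolds = 5,
                   keep_cross_validation_predictions = TRUE)

    ## render the H2OBinomialMetrics objects, labeling each set of metrics with `type`
    pander(h2o.performance(fit, train = TRUE), type = "training")
    pander(h2o.performance(fit, xval = TRUE),  type = "cross-validation")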
diff --git a/man/pander.H2ORegressionMetrics.Rd b/man/pander.H2ORegressionMetrics.Rd index c123e8d..a890ea9 100644 --- a/man/pander.H2ORegressionMetrics.Rd +++ b/man/pander.H2ORegressionMetrics.Rd @@ -8,6 +8,8 @@ } \arguments{ \item{H2ORegressionMetricsObject}{H2ORegressionMetrics object} + +\item{type}{Character name specifying the type of metric (e.g., "training", "validation", "cross-validation")} } \value{ By default this function outputs (see: \code{?cat}) the result. diff --git a/man/plotMSEs.Rd b/man/plotMSEs.Rd index 0c1b1f5..d94e3c9 100644 --- a/man/plotMSEs.Rd +++ b/man/plotMSEs.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/utility_funs.R \name{plotMSEs} \alias{plotMSEs} -\title{Plot the top K smallest MSEs for a given model ensemble object.} +\title{Plot the top (smallest) validation MSEs for an ensemble of prediction models} \usage{ plotMSEs(PredictionModel, K = 1, interactive = FALSE) } @@ -12,8 +12,10 @@ or an object of class \code{PredictionStack} (returned by \code{make_PredictionS Must also contain validation /test set predictions and corresponding MSEs.} \item{K}{How many top (smallest) MSEs should be plotted? Default is 5.} + +\item{interactive}{Setting this to \code{TRUE} will produce an interactive plot in html format using the package \code{ggiraph}.} } \description{ -Plot the top K smallest MSEs for a given model ensemble object. +Plot the top (smallest) validation MSEs for an ensemble of prediction models } diff --git a/man/print.GLMmodel.Rd b/man/print.GLMmodel.Rd index de66319..0817f4a 100644 --- a/man/print.GLMmodel.Rd +++ b/man/print.GLMmodel.Rd @@ -4,10 +4,10 @@ \alias{print.GLMmodel} \title{S3 methods for printing model fit summary for glmfit class object} \usage{ -\method{print}{GLMmodel}(model.fit, ...) +\method{print}{GLMmodel}(x, ...) } \arguments{ -\item{model.fit}{The model fit object produced by functions stremr:::fit.glm or stremr:::fit.speedglm} +\item{x}{The model fit object produced by functions stremr:::fit.glm or stremr:::fit.speedglm} \item{...}{Additional options passed on to \code{summary.GLMmodel}.} } diff --git a/man/print.H2Oensemblemodel.Rd b/man/print.H2Oensemblemodel.Rd index fd7bc30..e56c35c 100644 --- a/man/print.H2Oensemblemodel.Rd +++ b/man/print.H2Oensemblemodel.Rd @@ -4,12 +4,10 @@ \alias{print.H2Oensemblemodel} \title{S3 methods for printing model fit summary for H2Omodel class object} \usage{ -\method{print}{H2Oensemblemodel}(model.fit, only.coefs = FALSE, ...) +\method{print}{H2Oensemblemodel}(x, ...) } \arguments{ -\item{model.fit}{The model fit object produced by any stremr S3 function starting with \code{stremr:::H2Omodel.}} - -\item{only.coefs}{Skip all of the h2o-specific model stats, only print the coefficients table (when running \code{fit.algorithm = "GLM"}).} +\item{x}{The model fit object produced by any stremr S3 function starting with \code{stremr:::H2Omodel.}} \item{...}{Additional options passed on to \code{summary.H2Omodel}.} } diff --git a/man/print.ModelStack.Rd b/man/print.ModelStack.Rd index f27a2b6..02cffd0 100644 --- a/man/print.ModelStack.Rd +++ b/man/print.ModelStack.Rd @@ -4,10 +4,10 @@ \alias{print.ModelStack} \title{S3 methods for printing a collection of learners} \usage{ -\method{print}{ModelStack}(modelstack, ...) +\method{print}{ModelStack}(x, ...) 
} \arguments{ -\item{modelstack}{An object (list) of class ModelStack} +\item{x}{An object (list) of class ModelStack} \item{...}{Additional options passed on to \code{print.PredictionModel}.} } diff --git a/man/print.PredictionStack.Rd b/man/print.PredictionStack.Rd index d35abdd..0969acc 100644 --- a/man/print.PredictionStack.Rd +++ b/man/print.PredictionStack.Rd @@ -4,15 +4,10 @@ \alias{print.PredictionStack} \title{S3 methods for printing model fit summary for PredictionModel R6 class object} \usage{ -\method{print}{PredictionStack}(modelstack, model_stats = FALSE, - all_fits = FALSE, ...) +\method{print}{PredictionStack}(x, ...) } \arguments{ -\item{modelstack}{The model fit object produced by functions \code{make_PredictionStack}.} - -\item{model_stats}{Also print some model summaries?} - -\item{all_fits}{Print all of the modeling fits contained in this object? Warning: this may produce a lot of output!} +\item{x}{The model fit object produced by functions \code{make_PredictionStack}.} \item{...}{Additional options passed on to \code{print.PredictionModel}.} } diff --git a/man/print.brokenstickmodel.Rd b/man/print.brokenstickmodel.Rd index c162970..172607b 100644 --- a/man/print.brokenstickmodel.Rd +++ b/man/print.brokenstickmodel.Rd @@ -4,10 +4,10 @@ \alias{print.brokenstickmodel} \title{S3 methods for printing model fit summary for brokenstickmodel class object} \usage{ -\method{print}{brokenstickmodel}(model.fit, ...) +\method{print}{brokenstickmodel}(x, ...) } \arguments{ -\item{model.fit}{The model fit object produced by functions stremr:::fit.glm or stremr:::fit.speedglm} +\item{x}{The model fit object produced by functions stremr:::fit.glm or stremr:::fit.speedglm} \item{...}{Additional options passed on to \code{summary.GLMmodel}.} } diff --git a/man/save_best_model.Rd b/man/save_best_model.Rd index b59531c..e2ca55f 100644 --- a/man/save_best_model.Rd +++ b/man/save_best_model.Rd @@ -8,6 +8,8 @@ save_best_model(modelfit, file.path = getOption("gridisl.file.path")) } \arguments{ \item{modelfit}{A model object of class \code{PredictionModel} returned by functions \code{fit_model} or \code{fit}.} + +\item{file.path}{Specify the directory where the model object file should be saved.} } \description{ Save the best performing h2o model diff --git a/man/summary.GLMmodel.Rd b/man/summary.GLMmodel.Rd index dddd3a5..19174b1 100644 --- a/man/summary.GLMmodel.Rd +++ b/man/summary.GLMmodel.Rd @@ -4,10 +4,10 @@ \alias{summary.GLMmodel} \title{S3 methods for fit summary for glmfit class} \usage{ -\method{summary}{GLMmodel}(model.fit, format_table = TRUE, ...) +\method{summary}{GLMmodel}(object, format_table = TRUE, ...) } \arguments{ -\item{model.fit}{The model fit object produced by functions stremr:::glmfit.glm or stremr:::glmfit.speedglm} +\item{object}{The model fit object produced by functions stremr:::glmfit.glm or stremr:::glmfit.speedglm} \item{format_table}{Format the coefficients into a data.frame table?} diff --git a/man/summary.H2ORegressionModel.Rd b/man/summary.H2ORegressionModel.Rd index 67b8ff6..45f56f9 100644 --- a/man/summary.H2ORegressionModel.Rd +++ b/man/summary.H2ORegressionModel.Rd @@ -5,12 +5,12 @@ \alias{summary.H2ORegressionModel} \title{S3 methods for fit summary for h2o} \usage{ -\method{summary}{H2ORegressionModel}(h2o.model, only.coefs = FALSE, ...) +\method{summary}{H2ORegressionModel}(object, only.coefs = FALSE, ...) -\method{summary}{H2OBinomialModel}(h2o.model, only.coefs = FALSE, ...) +\method{summary}{H2OBinomialModel}(object, only.coefs = FALSE, ...) 
} \arguments{ -\item{h2o.model}{The model fit object produced by h2o (and extracted with \code{getmodel_byname}).} +\item{object}{The model fit object produced by h2o (and extracted with \code{getmodel_byname}).} \item{only.coefs}{Skip all of the h2o-specific model stats, only print the coefficients table (when running \code{fit.algorithm = "glm"}).} diff --git a/man/summary.H2Oensemblemodel.Rd b/man/summary.H2Oensemblemodel.Rd index 19b8e76..37a1dac 100644 --- a/man/summary.H2Oensemblemodel.Rd +++ b/man/summary.H2Oensemblemodel.Rd @@ -4,11 +4,11 @@ \alias{summary.H2Oensemblemodel} \title{S3 methods for getting model fit summary for H2Oensemblemodel class object} \usage{ -\method{summary}{H2Oensemblemodel}(h2o.ensemble, only.coefs = FALSE, +\method{summary}{H2Oensemblemodel}(object, only.coefs = FALSE, format_table = TRUE, ...) } \arguments{ -\item{h2o.ensemble}{The model fit object produced by h2oEnsemble package} +\item{object}{The model fit object produced by h2oEnsemble package} \item{only.coefs}{Skip all of the h2o-specific model stats, only print the coefficients table (when running \code{fit.algorithm = "glm"}).} diff --git a/man/summary.brokenstickmodel.Rd b/man/summary.brokenstickmodel.Rd index de6ec8f..590707c 100644 --- a/man/summary.brokenstickmodel.Rd +++ b/man/summary.brokenstickmodel.Rd @@ -4,10 +4,10 @@ \alias{summary.brokenstickmodel} \title{S3 methods for getting model fit summary for glmfit class object} \usage{ -\method{summary}{brokenstickmodel}(model.fit, format_table = TRUE, ...) +\method{summary}{brokenstickmodel}(object, format_table = TRUE, ...) } \arguments{ -\item{model.fit}{The model fit object produced by functions stremr:::glmfit.glm or stremr:::glmfit.speedglm} +\item{object}{The model fit object produced by functions stremr:::glmfit.glm or stremr:::glmfit.speedglm} \item{format_table}{Format the coefficients into a data.frame table?} diff --git a/man/summary.xgb.Booster.Rd b/man/summary.xgb.Booster.Rd index e10d90a..3cab087 100644 --- a/man/summary.xgb.Booster.Rd +++ b/man/summary.xgb.Booster.Rd @@ -5,12 +5,12 @@ \alias{summary.xgb.cv.synchronous} \title{S3 methods for fit summary from xgboost} \usage{ -\method{summary}{xgb.Booster}(xgb.model, ...) +\method{summary}{xgb.Booster}(object, ...) -\method{summary}{xgb.cv.synchronous}(xgb.model, ...) +\method{summary}{xgb.cv.synchronous}(object, ...) } \arguments{ -\item{xgb.model}{The model fit object produced by xgboost (and extracted with \code{getmodel_byname}).} +\item{object}{The model fit object produced by xgboost (and extracted with \code{getmodel_byname}).} \item{...}{Additional options (not used)} } diff --git a/man/xgb.grid.Rd b/man/xgb.grid.Rd new file mode 100644 index 0000000..bfc2045 --- /dev/null +++ b/man/xgb.grid.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ModelXGBoostGridLearner.R +\name{xgb.grid} +\alias{xgb.grid} +\title{Hyper-parameter grid search for xgboost} +\usage{ +xgb.grid(param_grid, data, nrounds, nfold, label = NULL, missing = NA, + prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, + feval = NULL, stratified = TRUE, folds = NULL, verbose = TRUE, + early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), + search_criteria, seed = NULL, order_metric_name = NULL, + validation_data = NULL, ...) +} +\arguments{ +\item{param_grid}{A named list with xgboost parameter names, consisting of vectors of hyper-parameter values. 
+The dataset containing the grid of possible hyper-parameters for model
+training is formed internally by running \code{purrr::cross_d(param_grid)}.}
+
+\item{data}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{nrounds}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{nfold}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{label}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{missing}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{prediction}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{showsd}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{metrics}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{obj}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{feval}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{stratified}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{folds}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{verbose}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{early_stopping_rounds}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{maximize}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{callbacks}{Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+
+\item{search_criteria}{Defines how to search over the grid of hyper-parameters.
+This should be a list of parameters controlling the grid search.
+Currently supported parameters are: \code{'strategy'} and \code{'max_models'}.
+Currently supported values for \code{strategy} are \code{'Cartesian'} (covers the entire space of hyper-parameter combinations) or
+\code{'RandomDiscrete'} (performs a random search over the hyper-parameter combinations).
+The \code{'max_models'} parameter can be set to a positive integer that defines the maximum number of models to be trained.}
+
+\item{seed}{Specify the seed to use for determining the random model order in random grid search.}
+
+\item{order_metric_name}{The name of the metric used for ranking the final grid of model fits.}
+
+\item{validation_data}{Validation data to score the model performance while training with \code{xgboost::xgb.train}.
+Must be in the same format as \code{data}, see \code{?xgboost::xgb.train} for additional information.}
+
+\item{...}{Other parameters passed on directly to either \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.}
+}
+\value{
+The resulting grid of model fits, returned as a \code{data.table} with the \code{xgboost} model fit
+objects saved in a list column named \code{'xgb_fit'}.
+In addition, the output \code{data.table} contains the original hyper-parameters used as well as the model
+performance metrics assessed by \code{xgboost}. The dataset is sorted according to \code{order_metric_name}.
+}
+\description{
+Performs a simple hyper-parameter grid search for xgboost. Model scoring can be
+done either with validation data or with V-fold cross-validation (a brief usage sketch appears at the end of this patch).
+}
+\author{
+The code for using \code{tidyverse} syntax for model grid search is borrowed and adapted from:
+\href{https://drsimonj.svbtle.com/grid-search-in-the-tidyverse}{https://drsimonj.svbtle.com/grid-search-in-the-tidyverse}.
+The \code{search_criteria} idea is borrowed from \code{h2o::h2o.grid}.
+} + diff --git a/tests/RUnit/RUnit_tests_02b_SLexample_xgboost.R b/tests/RUnit/RUnit_tests_02b_SLexample_xgboost.R index 46b92b9..71be4ba 100644 --- a/tests/RUnit/RUnit_tests_02b_SLexample_xgboost.R +++ b/tests/RUnit/RUnit_tests_02b_SLexample_xgboost.R @@ -33,11 +33,15 @@ test.xgb.grid.printing <- function() { } } + plotMSEs(xgboost_holdout, interactive = TRUE) + make_model_report(xgboost_holdout, data = cpp_holdout, K = 10, file.name = paste0("GLMs_", getOption("gridisl.file.name")), title = paste0("Growth Curve Imputation with GLM"), - format = "html", keep_md = TRUE, openFile = TRUE) + format = "html", keep_md = TRUE, + # openFile = TRUE) + openFile = FALSE) }
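For reference, a minimal usage sketch consistent with the man/xgb.grid.Rd entry added above. The toy data, the grid values, and the choice of 'rmse' as the ordering metric are placeholders for illustration, not part of this patch:

    library(gridisl)
    library(xgboost)

    ## toy regression data (mtcars ships with base R)
    dtrain <- xgb.DMatrix(data = as.matrix(mtcars[, -1]), label = mtcars[["mpg"]])

    ## named list of hyper-parameter values, crossed internally via purrr::cross_d()
    param_grid <- list(
      objective = "reg:linear",
      eta = c(0.05, 0.1, 0.3),
      max_depth = c(2, 4, 6)
    )

    grid_fits <- xgb.grid(
      param_grid = param_grid,
      data = dtrain,
      nrounds = 100,
      nfold = 5,                  # score by 5-fold cross-validation
      early_stopping_rounds = 10,
      search_criteria = list(strategy = "RandomDiscrete", max_models = 5),
      seed = 12345,
      order_metric_name = "rmse"  # assumed name of the CV metric used for ranking
    )

    grid_fits                       # data.table, sorted by the ordering metric
    grid_fits[["xgb_fit"]][[1]]     # xgboost fit object for the top-ranked model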