adding docs for xgb.grid and clean-up R checks
osofr committed Jan 29, 2017
1 parent e904b3f commit c7aacce
Showing 34 changed files with 309 additions and 738 deletions.
9 changes: 2 additions & 7 deletions NAMESPACE
@@ -4,7 +4,6 @@ S3method("+",ModelStack)
S3method(fit,ModelStack)
S3method(fit,brokenstick)
S3method(fit,face)
S3method(fit,h2ogrid)
S3method(fit,xgb.grid)
S3method(newFitModel,brokenstick)
S3method(newFitModel,face)
@@ -51,18 +50,12 @@ export(eval_MSE)
export(faceModelClass)
export(fit)
export(fit_model)
export(fit_single_h2o_grid)
export(fit_single_xgboost_grid)
export(get_out_of_sample_predictions)
export(get_train_data)
export(get_validation_data)
export(glmModelClass)
export(gridislOptions)
export(h2o.deeplearning.wrapper)
export(h2o.example.wrapper)
export(h2o.gbm.wrapper)
export(h2o.glm.wrapper)
export(h2o.randomForest.wrapper)
export(make_PredictionStack)
export(make_model_report)
export(newFitModel)
@@ -77,6 +70,8 @@ export(set_all_gridisl_options)
export(xgb.grid)
import(R6)
import(data.table)
import(ggiraph)
import(ggplot2)
importFrom(Rcpp,sourceCpp)
importFrom(assertthat,assert_that)
importFrom(assertthat,is.count)
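
For context, NAMESPACE entries like the ones above are normally generated rather than edited by hand. The sketch below is an illustration only (not taken from the package sources) of roxygen2 tags that devtools::document() would translate into comparable export/import directives, assuming the package uses roxygen2.

#' @export
#' @import data.table
#' @import ggplot2
#' @importFrom assertthat assert_that is.count
xgb.grid <- function(...) NULL   # placeholder body, for illustration only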
6 changes: 3 additions & 3 deletions R/ModelGLMsClass.R
@@ -19,7 +19,7 @@ fit.glm <- function(fit.class, params, Xmat, Yvals, model_contrl, ...) {
SuppressGivenWarnings({
model.fit <- stats::glm.fit(x = Xmat,
y = Yvals,
family = quasibinomial() ,
family = stats::quasibinomial() ,
control = ctrl)
}, GetWarningsToSuppress())
}
@@ -54,7 +54,7 @@ fit.speedglm <- function(fit.class, params, Xmat, Yvals, model_contrl, ...) {
model.fit <- try(speedglm::speedglm.wfit(X = Xmat,
y = Yvals,
method = 'Cholesky',
family = quasibinomial(),
family = stats::quasibinomial(),
trace = FALSE),
silent = TRUE)
}, GetWarningsToSuppress())
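
The stats:: qualifications above line up with the commit's R-check clean-up: calling stats::quasibinomial() explicitly typically avoids "no visible global function definition" notes when the stats namespace is not imported. Below is a minimal, self-contained sketch of the same pattern; the toy design matrix and outcome are assumptions, not the package's data.

set.seed(1)
Xmat  <- cbind(Intercept = 1, x = rnorm(100))   # toy design matrix
Yvals <- rbinom(100, size = 1, prob = 0.5)      # toy binary outcome
fit <- stats::glm.fit(x = Xmat, y = Yvals,
                      family = stats::quasibinomial(),
                      control = stats::glm.control())
head(fit$fitted.values)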
@@ -121,7 +121,7 @@ predictP1.GLMmodel <- function(m.fit, ParentObject, DataStorageObject, subset_id
#' }
#' @section Methods:
#' \describe{
#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{\link{RegressionClass}} object to instantiate a new storage container for a
#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{RegressionClass} object to instantiate a new storage container for a
#' design matrix and binary outcome.}
#' \item{\code{setdata()}}{...}
#' }
51 changes: 3 additions & 48 deletions R/ModelH2OGridLearner.R
@@ -1,49 +1,5 @@
# # default metalearner (NN least squares)
# h2o.glm_nn <- function(..., non_negative = TRUE) h2o.glm.wrapper(..., non_negative = non_negative)

# # Train a model using a single h2o learner (user-spec) with cross-validation & keep CV predictions
# # @export
# fit_single_h2o_learner <- function(learner, training_frame, y, x, family = "binomial", model_contrl, fold_column, validation_frame = NULL, ...) {
# if (gvars$verbose) h2o::h2o.show_progress() else h2o::h2o.no_progress()
# learner_fun <- match.fun(learner)

# mainArgs <- list(y = y, training_frame = training_frame, family = family,
# keep_cross_validation_predictions = TRUE,
# keep_cross_validation_fold_assignment = TRUE)

# if (!missing(fold_column)) {
# if (!is.null(fold_column) && is.character(fold_column) && (fold_column != "")) {
# mainArgs$fold_column <- fold_column
# }
# }

# mainArgs <- replace_add_user_args(mainArgs, model_contrl, fun = learner_fun)

# if (!is.null(validation_frame)) mainArgs$validation_frame <- validation_frame

# if (("x" %in% names(formals(learner))) && (as.character(formals(learner)$x)[1] != "")) {
# # Special case where we pass a subset of the colnames, x, in a custom learner function wrapper
# # model_fit <- learner_fun(y = y, training_frame = training_frame, validation_frame = NULL, family = family, fold_column = fold_column, keep_cross_validation_folds = TRUE)
# } else {
# # Use all predictors in training set for training
# mainArgs$x <- x
# # model_fit <- learner_fun(y = y, x = x, training_frame = training_frame, validation_frame = NULL, family = family, fold_column = fold_column, keep_cross_validation_folds = TRUE)
# }

# model_fit <- do.call(learner_fun, mainArgs)

# fit <- vector(mode = "list")
# fit$fitfunname <- learner;
# if (gvars$verbose) {
# print("grid search fitted models:"); print(model_fit)
# }
# fit$model_fit <- model_fit
# class(fit) <- c(class(fit)[1], c("H2Omodel"))
# return(fit)
# }


#' @export


fit_single_h2o_grid <- function(grid.algorithm, training_frame, y, x, family = "binomial", model_contrl, fold_column, validation_frame = NULL, ...) {
if (gvars$verbose) h2o::h2o.show_progress() else h2o::h2o.no_progress()
mainArgs <- list(x = x, y = y, training_frame = training_frame,
@@ -81,7 +37,7 @@ fit_single_h2o_grid <- function(grid.algorithm, training_frame, y, x, family = "

## doesn't work if h2o namespace is not loaded:
# algo_fun <- get0(algo_fun_name, mode = "function", inherits = TRUE)
algo_fun <- getFromNamespace(algo_fun_name, ns='h2o')
algo_fun <- utils::getFromNamespace(algo_fun_name, ns='h2o')

mainArgs <- keep_only_fun_args(mainArgs, fun = algo_fun) # Keep only the relevant args in mainArgs list:
mainArgs <- replace_add_user_args(mainArgs, model_contrl, fun = algo_fun) # Add user args that pertain to this specific learner:
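
The utils::getFromNamespace() call above retrieves an h2o function by name without requiring the h2o namespace to be attached. A hedged, stand-alone illustration of that lookup follows; "h2o.gbm" is an assumed algorithm name, and the h2o package must be installed for it to run.

algo_fun_name <- "h2o.gbm"                                     # assumed algorithm name
algo_fun <- utils::getFromNamespace(algo_fun_name, ns = "h2o")
print(is.function(algo_fun))                                   # TRUE when h2o is installed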
@@ -120,7 +76,6 @@ fit_single_h2o_grid <- function(grid.algorithm, training_frame, y, x, family = "

}

#' @export
fit.h2ogrid <- function(fit.class, params, training_frame, y, x, model_contrl, fold_column, ...) {
family <- model_contrl[["family"]]
if (is.null(family)) family <- "binomial"
4 changes: 2 additions & 2 deletions R/ModelH2OOutOfSamplePrediction.R
@@ -1,5 +1,5 @@
check_out_of_sample_consistency <- function(models_list, valid_H2Oframe, predvars, fold_column) {
all_folds_h2o <- lapply(models_list, h2o.cross_validation_fold_assignment)
all_folds_h2o <- lapply(models_list, function(x) h2o::h2o.cross_validation_fold_assignment(x))
train_frame_ID_1 <- models_list[[1]]@parameters$training_frame

if (length(all_folds_h2o) > 1) {
@@ -128,7 +128,7 @@ predict_out_of_sample_cv <- function(m.fit, ParentObject, validation_data, subse
if (ncol(newpreds) > 1) newpreds <- newpreds[["p1"]]
newpreds_prev_CV_i <- h2o::h2o.cbind(newpreds_prev_CV_i, newpreds)
}
newpreds_prev_CV_i <- h2o::h2o.cbind(h2o.which(fold_CV_i_logical), newpreds_prev_CV_i)
newpreds_prev_CV_i <- h2o::h2o.cbind(h2o::h2o.which(fold_CV_i_logical), newpreds_prev_CV_i)
pAoutMat_h2o <- h2o::h2o.rbind(pAoutMat_h2o, newpreds_prev_CV_i)
}

8 changes: 3 additions & 5 deletions R/ModelPredictionStack.R
@@ -1,13 +1,11 @@
#' S3 methods for printing model fit summary for PredictionModel R6 class object
#'
#' Prints the modeling summaries
#' @param modelstack The model fit object produced by functions \code{make_PredictionStack}.
#' @param model_stats Also print some model summaries?
#' @param all_fits Print all of the modeling fits contained in this object? Warning: this may produce a lot of output!
#' @param x The model fit object produced by functions \code{make_PredictionStack}.
#' @param ... Additional options passed on to \code{print.PredictionModel}.
#' @export
print.PredictionStack <- function(modelstack, model_stats = FALSE, all_fits = FALSE, ...) {
modelstack$show(model_stats = model_stats, all_fits = all_fits)
print.PredictionStack <- function(x, ...) {
x$show(...)
return(invisible(NULL))
}
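
The new signature matches the print generic, print(x, ...), which is what R CMD check's S3 generic/method consistency check requires; extra display options are now forwarded through ... to the object's show() method. A minimal sketch of the convention (not the package's own class) is below.

print.MyStack <- function(x, ...) {
  cat("MyStack holding", length(x$fits), "model fits\n")
  return(invisible(NULL))
}
obj <- structure(list(fits = list(fit1 = 1, fit2 = 2)), class = "MyStack")
print(obj)   # dispatches to print.MyStack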

2 changes: 1 addition & 1 deletion R/ModelXGBoostClass.R
@@ -349,7 +349,7 @@ predictP1.XGBoostmodel <- function(m.fit, ParentObject, DataStorageObject, subse
#' }
#' @section Methods:
#' \describe{
#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{\link{RegressionClass}} object to instantiate a new storage container for a
#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{RegressionClass} object to instantiate a new storage container for a
#' design matrix and binary outcome.}
#' \item{\code{setdata()}}{...}
#' }
54 changes: 49 additions & 5 deletions R/ModelXGBoostGridLearner.R
@@ -1,6 +1,12 @@


if(getRversion() >= "2.15.1") {
utils::globalVariables(c("xgb_fit", "niter"))
}

## Custom MSE evaluation function: computes the subject-level MSE first, then averages across subjects
evalMSEerror_byID <- function(preds, data) {
labels <- getinfo(data, "label")
labels <- xgboost::getinfo(data, "label")
# The usual RMSE (based on rows in the dataset)
# err = sqrt(as.numeric(sum((labels - preds)^2))/length(labels))
# RMSE based on the average MSE across subjects first
@@ -15,8 +21,47 @@ evalMSEerror_byID <- function(preds, data) {
return(list(metric = "RMSE", value = err))
}
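
evalMSEerror_byID follows the xgboost custom-evaluation contract: a function of (preds, data) that returns list(metric = <name>, value = <numeric>). The sketch below shows that contract with a plain row-level RMSE on the bundled agaricus data; it is an illustration, not the package's metric, and the argument name feval follows the xgboost interface referenced in this diff.

library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgboost::xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

eval_rmse <- function(preds, dtrain) {
  labels <- xgboost::getinfo(dtrain, "label")
  err <- sqrt(mean((labels - preds)^2))
  list(metric = "RMSE", value = err)
}

cv_res <- xgboost::xgb.cv(params = list(objective = "binary:logistic"),
                          data = dtrain, nrounds = 5, nfold = 3,
                          feval = eval_rmse, verbose = 0)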

## Performing simple hyperparameter grid search for xgboost with cross-validation
## Relies on tidyverse syntax and borrowed from: https://drsimonj.svbtle.com/grid-search-in-the-tidyverse
#' Hyper-parameter grid search for xgboost
#'
#' Performs a simple hyper-parameter grid search for xgboost. Model scoring can be
#' done either with validation data or with V-fold cross-validation.
#' @param param_grid A named list of xgboost parameter names, each element being a vector of hyper-parameter values.
#' The grid of all candidate hyper-parameter combinations for model
#' training is formed internally by running \code{purrr::cross_d(param_grid)}.
#' @param data Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param nrounds Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param nfold Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param label Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param missing Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param prediction Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param showsd Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param metrics Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param obj Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param feval Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param stratified Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param folds Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param verbose Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param early_stopping_rounds Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param maximize Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param callbacks Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param search_criteria Defines how to search over the grid of hyper-parameters.
#' This should be a list of parameters controlling the grid search.
#' Currently supported parameters are \code{'strategy'} and \code{'max_models'}.
#' Supported values for \code{'strategy'} are \code{'Cartesian'} (covers the entire space of hyper-parameter combinations) and
#' \code{'RandomDiscrete'} (performs a random search over the hyper-parameter combinations).
#' \code{'max_models'} can be set to an integer > 0 that defines the maximum number of models to be trained.
#' @param seed Specify the seed to use for determining the random model order in random grid search.
#' @param order_metric_name The name of the metric used for ranking the final grid of model fits.
#' @param validation_data Validation data to score the model performance while training with \code{xgboost::xgb.train}.
#' Must be in the same format as \code{data}, see \code{?xgboost::xgb.train} for additional information.
#' @param ... Other parameters passed on directly to either \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @return The resulting grid of model fits, returned as a \code{data.table} with the \code{xgboost} model fit
#' objects saved in a list column named \code{'xgb_fit'}.
#' In addition, the output \code{data.table} contains the original hyper-parameters used, as well as the model
#' performance metrics assessed by \code{xgboost}. The rows are sorted according to \code{order_metric_name}.
#' @author The code for using \code{tidyverse} syntax for model grid search is borrowed and adapted from:
#' \href{https://drsimonj.svbtle.com/grid-search-in-the-tidyverse}{https://drsimonj.svbtle.com/grid-search-in-the-tidyverse}.
#' The \code{search_criteria} idea is borrowed from \code{h2o::h2o.grid}.
#' @export
# print_every_n = 1L,
xgb.grid <- function(param_grid, data, nrounds, nfold, label = NULL, missing = NA,
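
A hedged usage sketch for xgb.grid(), based only on the documentation above: the hyper-parameter values, the "rmse" metric name, and the use of the bundled agaricus data are illustrative assumptions, and the gridisl package exporting xgb.grid() is assumed to be installed.

library(gridisl)
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgboost::xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

param_grid <- list(
  eta       = c(0.1, 0.3),   # all combinations are formed via purrr::cross_d()
  max_depth = c(3L, 6L)
)

grid_fits <- xgb.grid(param_grid, data = dtrain, nrounds = 20, nfold = 5,
                      metrics = list("rmse"),
                      search_criteria = list(strategy = "RandomDiscrete", max_models = 3),
                      seed = 12345,
                      order_metric_name = "rmse")
grid_fits[["xgb_fit"]][[1]]   # top-ranked xgboost fit after sorting by order_metric_name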
Expand Down Expand Up @@ -132,13 +177,12 @@ xgb.grid <- function(param_grid, data, nrounds, nfold, label = NULL, missing = N
## ------------------------------------------------------------
## Sequentially fit each model from the grid
## ------------------------------------------------------------
gs <- gs %>% dplyr::mutate(xgb_fit = purrr::pmap(gs, run_singe_model))
# data.table::setDT(gs)
# for (i in 1:nrow(gs)) {
# gs[i, xgb_fit := list(list(purrr::lift(run_singe_model)(gs[i, ])))]
# }

gs <- gs %>% dplyr::mutate(xgb_fit = purrr::pmap(gs, run_singe_model))
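
The dplyr::mutate() + purrr::pmap() line above maps each row of the hyper-parameter grid to one model fit, following the tidyverse grid-search pattern cited in the documentation. A self-contained sketch of that pattern follows; run_single_model is a hypothetical stand-in for the real xgboost fitting call.

library(dplyr)
library(purrr)

gs_demo <- expand.grid(eta = c(0.1, 0.3), max_depth = c(3, 6))   # one row per combination
run_single_model <- function(eta, max_depth) {
  list(eta = eta, max_depth = max_depth)                         # placeholder "model fit"
}
gs_demo <- gs_demo %>% dplyr::mutate(xgb_fit = purrr::pmap(gs_demo, run_single_model))
str(gs_demo$xgb_fit[[1]])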

## ------------------------------------------------------------
## TO RUN GRID MODELS IN PARALLEL
## ------------------------------------------------------------
34 changes: 1 addition & 33 deletions R/ModelXGBoostOutOfSamplePrediction.R
@@ -1,43 +1,11 @@
# check_out_of_sample_consistency <- function(models_list, valid_H2Oframe, predvars, fold_column) {
# all_folds_h2o <- lapply(models_list, h2o.cross_validation_fold_assignment)
# train_frame_ID_1 <- models_list[[1]]@parameters$training_frame

# if (length(all_folds_h2o) > 1) {
# ## 1. Test that the exactly the same fold assignments were used by all CV models in the ensemble.
# for (idx in 2:length(all_folds_h2o) ) {
# if (!h2o::h2o.all(all_folds_h2o[[1]]==all_folds_h2o[[idx]]) )
# stop("Out-of-sample (holdout) predictions for new data has failed. The fold assignmets of the following CV model do not match to others: " %+% names(models_list)[idx])
# }

# ## 2. Test that same training h2oFrame was used for all models in the ensemble (just in case).
# for (idx in 2:length(all_folds_h2o) ) {
# if (!all.equal(train_frame_ID_1, models_list[[idx]]@parameters$training_frame))
# stop("Out-of-sample (holdout) predictions for new data has failed. It appears that some of the CV models in ensemble used different training frames.")
# }
# }

# ## 3. Test that the validation and training data have exactly the same fold assignments (in h2oFrame)
# if (!all(valid_H2Oframe[[fold_column]] == all_folds_h2o[[1]]))
# stop("Out-of-sample (holdout) predictions for new data has failed. The fold assignments in new data (validation_data) and training data appear to be different.")

# ## 4a. Test that the new validation data (in h2oFrame) has the same number of observations as the training data
# if (!(nrow(valid_H2Oframe) == nrow(h2o::h2o.getFrame(train_frame_ID_1))))
# stop("Out-of-sample (holdout) predictions for new data has failed. The number of rows in new data (validation_data) does not match that of the training data.")

# ## 4b. Test that all predictors are present in the validation data (in h2oFrame)
# if (!all(c(predvars,fold_column) %in% colnames(valid_H2Oframe)))
# stop("Out-of-sample (holdout) predictions for new data has failed. Some of the predictors were not found in new data (validation_data).")
# return(invisible(TRUE))
# }

## ----------------------------------------------------------------------------------------------------------------------------------
## Evaluate out-of-sample predictions from V cross-validation models, based on new validation_data.
## Can be useful for re-scoring the models when the validation data must differ from the training data in V-fold cross-validation.
## Will only perform the out-of-sample predictions for each model V_i
## (i.e., predictions in validation_data will be only made for rows that were not used for training the model V_i)
## In the end we generate a vector of n=nrow(validation_data) predictions by combining predictions from all models V=(V_1,...,V_v)
## This procedure is repeated for each cross-validated model in the ensemble, resulting in a matrix of predictions (n,k),
## where k is the total number of models trained by this ensemble (with h2o.grid, etc) and is equal to length(models_list)
## where k is the total number of models trained by this ensemble (with xgb.grid, etc) and is equal to length(models_list)
## ----------------------------------------------------------------------------------------------------------------------------------
xgb_predict_out_of_sample_cv <- function(m.fit, ParentObject, validation_data, subset_idx, predict_model_names, ...) {
models_list <- m.fit$modelfits_all
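
A hedged, simplified illustration of the scheme described in the comment block above, using lm as a stand-in model: each fold model V_i predicts only the validation rows it never saw during training, and the fold-specific predictions are combined into a single vector of length n. The toy data and fold assignment are assumptions for illustration.

set.seed(1)
n <- 20
dat <- data.frame(x = rnorm(n))
dat$y <- dat$x + rnorm(n)
folds <- sample(rep(1:4, length.out = n))   # V = 4 folds

preds <- rep(NA_real_, n)
for (v in sort(unique(folds))) {
  holdout <- folds == v
  fit_v <- stats::lm(y ~ x, data = dat[!holdout, ])
  preds[holdout] <- stats::predict(fit_v, newdata = dat[holdout, ])
}
head(preds)   # out-of-sample predictions for every row, one fold model per row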