adding docs for xgb.grid and clean-up R checks
osofr committed Jan 29, 2017
1 parent e904b3f commit c7aacce
Showing 34 changed files with 309 additions and 738 deletions.
9 changes: 2 additions & 7 deletions NAMESPACE
@@ -4,7 +4,6 @@ S3method("+",ModelStack)
S3method(fit,ModelStack)
S3method(fit,brokenstick)
S3method(fit,face)
S3method(fit,h2ogrid)
S3method(fit,xgb.grid)
S3method(newFitModel,brokenstick)
S3method(newFitModel,face)
@@ -51,18 +50,12 @@ export(eval_MSE)
export(faceModelClass)
export(fit)
export(fit_model)
export(fit_single_h2o_grid)
export(fit_single_xgboost_grid)
export(get_out_of_sample_predictions)
export(get_train_data)
export(get_validation_data)
export(glmModelClass)
export(gridislOptions)
export(h2o.deeplearning.wrapper)
export(h2o.example.wrapper)
export(h2o.gbm.wrapper)
export(h2o.glm.wrapper)
export(h2o.randomForest.wrapper)
export(make_PredictionStack)
export(make_model_report)
export(newFitModel)
@@ -77,6 +70,8 @@ export(set_all_gridisl_options)
export(xgb.grid)
import(R6)
import(data.table)
import(ggiraph)
import(ggplot2)
importFrom(Rcpp,sourceCpp)
importFrom(assertthat,assert_that)
importFrom(assertthat,is.count)
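
For context, NAMESPACE entries like the ones above are normally generated rather than edited by hand. The sketch below is an illustration only (not taken from the package sources) of roxygen2 tags that devtools::document() would translate into comparable export/import directives, assuming the package uses roxygen2.

#' @export
#' @import data.table
#' @import ggplot2
#' @importFrom assertthat assert_that is.count
xgb.grid <- function(...) NULL   # placeholder body, for illustration only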
6 changes: 3 additions & 3 deletions R/ModelGLMsClass.R
@@ -19,7 +19,7 @@ fit.glm <- function(fit.class, params, Xmat, Yvals, model_contrl, ...) {
SuppressGivenWarnings({
model.fit <- stats::glm.fit(x = Xmat,
y = Yvals,
family = quasibinomial() ,
family = stats::quasibinomial() ,
control = ctrl)
}, GetWarningsToSuppress())
}
@@ -54,7 +54,7 @@ fit.speedglm <- function(fit.class, params, Xmat, Yvals, model_contrl, ...) {
model.fit <- try(speedglm::speedglm.wfit(X = Xmat,
y = Yvals,
method = 'Cholesky',
family = quasibinomial(),
family = stats::quasibinomial(),
trace = FALSE),
silent = TRUE)
}, GetWarningsToSuppress())
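
The stats:: qualifications above line up with the commit's R-check clean-up: calling stats::quasibinomial() explicitly typically avoids "no visible global function definition" notes when the stats namespace is not imported. Below is a minimal, self-contained sketch of the same pattern; the toy design matrix and outcome are assumptions, not the package's data.

set.seed(1)
Xmat  <- cbind(Intercept = 1, x = rnorm(100))   # toy design matrix
Yvals <- rbinom(100, size = 1, prob = 0.5)      # toy binary outcome
fit <- stats::glm.fit(x = Xmat, y = Yvals,
                      family = stats::quasibinomial(),
                      control = stats::glm.control())
head(fit$fitted.values)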
@@ -121,7 +121,7 @@ predictP1.GLMmodel <- function(m.fit, ParentObject, DataStorageObject, subset_id
#' }
#' @section Methods:
#' \describe{
#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{\link{RegressionClass}} object to instantiate a new storage container for a
#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{RegressionClass} object to instantiate a new storage container for a
#' design matrix and binary outcome.}
#' \item{\code{setdata()}}{...}
#' }
51 changes: 3 additions & 48 deletions R/ModelH2OGridLearner.R
@@ -1,49 +1,5 @@
# # default metalearner (NN least squares)
# h2o.glm_nn <- function(..., non_negative = TRUE) h2o.glm.wrapper(..., non_negative = non_negative)

# # Train a model using a single h2o learner (user-spec) with cross-validation & keep CV predictions
# # @export
# fit_single_h2o_learner <- function(learner, training_frame, y, x, family = "binomial", model_contrl, fold_column, validation_frame = NULL, ...) {
# if (gvars$verbose) h2o::h2o.show_progress() else h2o::h2o.no_progress()
# learner_fun <- match.fun(learner)

# mainArgs <- list(y = y, training_frame = training_frame, family = family,
# keep_cross_validation_predictions = TRUE,
# keep_cross_validation_fold_assignment = TRUE)

# if (!missing(fold_column)) {
# if (!is.null(fold_column) && is.character(fold_column) && (fold_column != "")) {
# mainArgs$fold_column <- fold_column
# }
# }

# mainArgs <- replace_add_user_args(mainArgs, model_contrl, fun = learner_fun)

# if (!is.null(validation_frame)) mainArgs$validation_frame <- validation_frame

# if (("x" %in% names(formals(learner))) && (as.character(formals(learner)$x)[1] != "")) {
# # Special case where we pass a subset of the colnames, x, in a custom learner function wrapper
# # model_fit <- learner_fun(y = y, training_frame = training_frame, validation_frame = NULL, family = family, fold_column = fold_column, keep_cross_validation_folds = TRUE)
# } else {
# # Use all predictors in training set for training
# mainArgs$x <- x
# # model_fit <- learner_fun(y = y, x = x, training_frame = training_frame, validation_frame = NULL, family = family, fold_column = fold_column, keep_cross_validation_folds = TRUE)
# }

# model_fit <- do.call(learner_fun, mainArgs)

# fit <- vector(mode = "list")
# fit$fitfunname <- learner;
# if (gvars$verbose) {
# print("grid search fitted models:"); print(model_fit)
# }
# fit$model_fit <- model_fit
# class(fit) <- c(class(fit)[1], c("H2Omodel"))
# return(fit)
# }


#' @export


fit_single_h2o_grid <- function(grid.algorithm, training_frame, y, x, family = "binomial", model_contrl, fold_column, validation_frame = NULL, ...) {
if (gvars$verbose) h2o::h2o.show_progress() else h2o::h2o.no_progress()
mainArgs <- list(x = x, y = y, training_frame = training_frame,
@@ -81,7 +37,7 @@ fit_single_h2o_grid <- function(grid.algorithm, training_frame, y, x, family = "

## doesn't work if h2o namespace is not loaded:
# algo_fun <- get0(algo_fun_name, mode = "function", inherits = TRUE)
algo_fun <- getFromNamespace(algo_fun_name, ns='h2o')
algo_fun <- utils::getFromNamespace(algo_fun_name, ns='h2o')

mainArgs <- keep_only_fun_args(mainArgs, fun = algo_fun) # Keep only the relevant args in mainArgs list:
mainArgs <- replace_add_user_args(mainArgs, model_contrl, fun = algo_fun) # Add user args that pertain to this specific learner:
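
The utils::getFromNamespace() call above retrieves an h2o function by name without requiring the h2o namespace to be attached. A hedged, stand-alone illustration of that lookup follows; "h2o.gbm" is an assumed algorithm name, and the h2o package must be installed for it to run.

algo_fun_name <- "h2o.gbm"                                     # assumed algorithm name
algo_fun <- utils::getFromNamespace(algo_fun_name, ns = "h2o")
print(is.function(algo_fun))                                   # TRUE when h2o is installed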
@@ -120,7 +76,6 @@ fit_single_h2o_grid <- function(grid.algorithm, training_frame, y, x, family = "

}

#' @export
fit.h2ogrid <- function(fit.class, params, training_frame, y, x, model_contrl, fold_column, ...) {
family <- model_contrl[["family"]]
if (is.null(family)) family <- "binomial"
4 changes: 2 additions & 2 deletions R/ModelH2OOutOfSamplePrediction.R
@@ -1,5 +1,5 @@
check_out_of_sample_consistency <- function(models_list, valid_H2Oframe, predvars, fold_column) {
all_folds_h2o <- lapply(models_list, h2o.cross_validation_fold_assignment)
all_folds_h2o <- lapply(models_list, function(x) h2o::h2o.cross_validation_fold_assignment(x))
train_frame_ID_1 <- models_list[[1]]@parameters$training_frame

if (length(all_folds_h2o) > 1) {
@@ -128,7 +128,7 @@ predict_out_of_sample_cv <- function(m.fit, ParentObject, validation_data, subse
if (ncol(newpreds) > 1) newpreds <- newpreds[["p1"]]
newpreds_prev_CV_i <- h2o::h2o.cbind(newpreds_prev_CV_i, newpreds)
}
newpreds_prev_CV_i <- h2o::h2o.cbind(h2o.which(fold_CV_i_logical), newpreds_prev_CV_i)
newpreds_prev_CV_i <- h2o::h2o.cbind(h2o::h2o.which(fold_CV_i_logical), newpreds_prev_CV_i)
pAoutMat_h2o <- h2o::h2o.rbind(pAoutMat_h2o, newpreds_prev_CV_i)
}

8 changes: 3 additions & 5 deletions R/ModelPredictionStack.R
@@ -1,13 +1,11 @@
#' S3 methods for printing model fit summary for PredictionModel R6 class object
#'
#' Prints the modeling summaries
#' @param modelstack The model fit object produced by functions \code{make_PredictionStack}.
#' @param model_stats Also print some model summaries?
#' @param all_fits Print all of the modeling fits contained in this object? Warning: this may produce a lot of output!
#' @param x The model fit object produced by functions \code{make_PredictionStack}.
#' @param ... Additional options passed on to \code{print.PredictionModel}.
#' @export
print.PredictionStack <- function(modelstack, model_stats = FALSE, all_fits = FALSE, ...) {
modelstack$show(model_stats = model_stats, all_fits = all_fits)
print.PredictionStack <- function(x, ...) {
x$show(...)
return(invisible(NULL))
}
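
The new signature matches the print generic, print(x, ...), which is what R CMD check's S3 generic/method consistency check requires; extra display options are now forwarded through ... to the object's show() method. A minimal sketch of the convention (not the package's own class) is below.

print.MyStack <- function(x, ...) {
  cat("MyStack holding", length(x$fits), "model fits\n")
  return(invisible(NULL))
}
obj <- structure(list(fits = list(fit1 = 1, fit2 = 2)), class = "MyStack")
print(obj)   # dispatches to print.MyStack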

2 changes: 1 addition & 1 deletion R/ModelXGBoostClass.R
@@ -349,7 +349,7 @@ predictP1.XGBoostmodel <- function(m.fit, ParentObject, DataStorageObject, subse
#' }
#' @section Methods:
#' \describe{
#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{\link{RegressionClass}} object to instantiate a new storage container for a
#' \item{\code{new(reg)}}{Uses \code{reg} R6 \code{RegressionClass} object to instantiate a new storage container for a
#' design matrix and binary outcome.}
#' \item{\code{setdata()}}{...}
#' }
54 changes: 49 additions & 5 deletions R/ModelXGBoostGridLearner.R
@@ -1,6 +1,12 @@


if(getRversion() >= "2.15.1") {
utils::globalVariables(c("xgb_fit", "niter"))
}

## Custom MSE evaluation function: computes the subject-level MSE first, then averages across subjects
evalMSEerror_byID <- function(preds, data) {
labels <- getinfo(data, "label")
labels <- xgboost::getinfo(data, "label")
# The usual RMSE (based on rows in the dataset)
# err = sqrt(as.numeric(sum((labels - preds)^2))/length(labels))
# RMSE based on the average MSE across subjects first
@@ -15,8 +21,47 @@ evalMSEerror_byID <- function(preds, data) {
return(list(metric = "RMSE", value = err))
}
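
evalMSEerror_byID follows the xgboost custom-evaluation contract: a function of (preds, data) that returns list(metric = <name>, value = <numeric>). The sketch below shows that contract with a plain row-level RMSE on the bundled agaricus data; it is an illustration, not the package's metric, and the argument name feval follows the xgboost interface referenced in this diff.

library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgboost::xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

eval_rmse <- function(preds, dtrain) {
  labels <- xgboost::getinfo(dtrain, "label")
  err <- sqrt(mean((labels - preds)^2))
  list(metric = "RMSE", value = err)
}

cv_res <- xgboost::xgb.cv(params = list(objective = "binary:logistic"),
                          data = dtrain, nrounds = 5, nfold = 3,
                          feval = eval_rmse, verbose = 0)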

## Performing simple hyperparameter grid search for xgboost with cross-validation
## Relies on tidyverse syntax and borrowed from: https://drsimonj.svbtle.com/grid-search-in-the-tidyverse
#' Hyper-parameter grid search for xgboost
#'
#' Performs a simple hyper-parameter grid search for xgboost. Model scoring can be
#' done either with validation data or with V-fold cross-validation.
#' @param param_grid A named list of xgboost parameter names, each element being a vector of hyper-parameter values.
#' The grid of all candidate hyper-parameter combinations for model
#' training is formed internally by running \code{purrr::cross_d(param_grid)}.
#' @param data Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param nrounds Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param nfold Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param label Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param missing Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param prediction Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param showsd Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param metrics Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param obj Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param feval Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param stratified Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param folds Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param verbose Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param early_stopping_rounds Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param maximize Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param callbacks Same as in \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @param search_criteria Defines how to search over the grid of hyper-parameters.
#' This should be a list of parameters controlling the grid search.
#' Currently supported parameters are \code{'strategy'} and \code{'max_models'}.
#' Supported values for \code{'strategy'} are \code{'Cartesian'} (covers the entire space of hyper-parameter combinations) and
#' \code{'RandomDiscrete'} (performs a random search over the hyper-parameter combinations).
#' \code{'max_models'} can be set to an integer > 0 that defines the maximum number of models to be trained.
#' @param seed Specify the seed to use for determining the random model order in random grid search.
#' @param order_metric_name The name of the metric used for ranking the final grid of model fits.
#' @param validation_data Validation data to score the model performance while training with \code{xgboost::xgb.train}.
#' Must be in the same format as \code{data}, see \code{?xgboost::xgb.train} for additional information.
#' @param ... Other parameters passed on directly to either \code{xgboost::xgb.train} or \code{xgboost::xgb.cv}.
#' @return The resulting grid of model fits, returned as a \code{data.table} with the \code{xgboost} model fit
#' objects saved in a list column named \code{'xgb_fit'}.
#' In addition, the output \code{data.table} contains the original hyper-parameters used, as well as the model
#' performance metrics assessed by \code{xgboost}. The rows are sorted according to \code{order_metric_name}.
#' @author The code for using \code{tidyverse} syntax for model grid search is borrowed and adapted from:
#' \href{https://drsimonj.svbtle.com/grid-search-in-the-tidyverse}{https://drsimonj.svbtle.com/grid-search-in-the-tidyverse}.
#' The \code{search_criteria} idea is borrowed from \code{h2o::h2o.grid}.
#' @export
# print_every_n = 1L,
xgb.grid <- function(param_grid, data, nrounds, nfold, label = NULL, missing = NA,
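
A hedged usage sketch for xgb.grid(), based only on the documentation above: the hyper-parameter values, the "rmse" metric name, and the use of the bundled agaricus data are illustrative assumptions, and the gridisl package exporting xgb.grid() is assumed to be installed.

library(gridisl)
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgboost::xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

param_grid <- list(
  eta       = c(0.1, 0.3),   # all combinations are formed via purrr::cross_d()
  max_depth = c(3L, 6L)
)

grid_fits <- xgb.grid(param_grid, data = dtrain, nrounds = 20, nfold = 5,
                      metrics = list("rmse"),
                      search_criteria = list(strategy = "RandomDiscrete", max_models = 3),
                      seed = 12345,
                      order_metric_name = "rmse")
grid_fits[["xgb_fit"]][[1]]   # top-ranked xgboost fit after sorting by order_metric_name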
Expand Down Expand Up @@ -132,13 +177,12 @@ xgb.grid <- function(param_grid, data, nrounds, nfold, label = NULL, missing = N
## ------------------------------------------------------------
## Sequentially fit each model from the grid
## ------------------------------------------------------------
gs <- gs %>% dplyr::mutate(xgb_fit = purrr::pmap(gs, run_singe_model))
# data.table::setDT(gs)
# for (i in 1:nrow(gs)) {
# gs[i, xgb_fit := list(list(purrr::lift(run_singe_model)(gs[i, ])))]
# }

gs <- gs %>% dplyr::mutate(xgb_fit = purrr::pmap(gs, run_singe_model))
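
The dplyr::mutate() + purrr::pmap() line above maps each row of the hyper-parameter grid to one model fit, following the tidyverse grid-search pattern cited in the documentation. A self-contained sketch of that pattern follows; run_single_model is a hypothetical stand-in for the real xgboost fitting call.

library(dplyr)
library(purrr)

gs_demo <- expand.grid(eta = c(0.1, 0.3), max_depth = c(3, 6))   # one row per combination
run_single_model <- function(eta, max_depth) {
  list(eta = eta, max_depth = max_depth)                         # placeholder "model fit"
}
gs_demo <- gs_demo %>% dplyr::mutate(xgb_fit = purrr::pmap(gs_demo, run_single_model))
str(gs_demo$xgb_fit[[1]])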

## ------------------------------------------------------------
## TO RUN GRID MODELS IN PARALLEL
## ------------------------------------------------------------
34 changes: 1 addition & 33 deletions R/ModelXGBoostOutOfSamplePrediction.R
@@ -1,43 +1,11 @@
# check_out_of_sample_consistency <- function(models_list, valid_H2Oframe, predvars, fold_column) {
# all_folds_h2o <- lapply(models_list, h2o.cross_validation_fold_assignment)
# train_frame_ID_1 <- models_list[[1]]@parameters$training_frame

# if (length(all_folds_h2o) > 1) {
# ## 1. Test that the exactly the same fold assignments were used by all CV models in the ensemble.
# for (idx in 2:length(all_folds_h2o) ) {
# if (!h2o::h2o.all(all_folds_h2o[[1]]==all_folds_h2o[[idx]]) )
# stop("Out-of-sample (holdout) predictions for new data has failed. The fold assignmets of the following CV model do not match to others: " %+% names(models_list)[idx])
# }

# ## 2. Test that same training h2oFrame was used for all models in the ensemble (just in case).
# for (idx in 2:length(all_folds_h2o) ) {
# if (!all.equal(train_frame_ID_1, models_list[[idx]]@parameters$training_frame))
# stop("Out-of-sample (holdout) predictions for new data has failed. It appears that some of the CV models in ensemble used different training frames.")
# }
# }

# ## 3. Test that the validation and training data have exactly the same fold assignments (in h2oFrame)
# if (!all(valid_H2Oframe[[fold_column]] == all_folds_h2o[[1]]))
# stop("Out-of-sample (holdout) predictions for new data has failed. The fold assignments in new data (validation_data) and training data appear to be different.")

# ## 4a. Test that the new validation data (in h2oFrame) has the same number of observations as the training data
# if (!(nrow(valid_H2Oframe) == nrow(h2o::h2o.getFrame(train_frame_ID_1))))
# stop("Out-of-sample (holdout) predictions for new data has failed. The number of rows in new data (validation_data) does not match that of the training data.")

# ## 4b. Test that all predictors are present in the validation data (in h2oFrame)
# if (!all(c(predvars,fold_column) %in% colnames(valid_H2Oframe)))
# stop("Out-of-sample (holdout) predictions for new data has failed. Some of the predictors were not found in new data (validation_data).")
# return(invisible(TRUE))
# }

## ----------------------------------------------------------------------------------------------------------------------------------
## Evaluate out-of-sample predictions from V cross-validation models, based on new validation_data.
## Can be useful for re-scoring the models when the validation data must differ from the training data in V-fold cross-validation.
## Will only perform the out-of-sample predictions for each model V_i
## (i.e., predictions in validation_data will be only made for rows that were not used for training the model V_i)
## In the end we generate a vector of n=nrow(validation_data) predictions by combining predictions from all models V=(V_1,...,V_v)
## This procedure is repeated for each cross-validated model in the ensemble, resulting in a matrix of predictions (n,k),
## where k is the total number of models trained by this ensemble (with h2o.grid, etc) and is equal to length(models_list)
## where k is the total number of models trained by this ensemble (with xgb.grid, etc) and is equal to length(models_list)
## ----------------------------------------------------------------------------------------------------------------------------------
xgb_predict_out_of_sample_cv <- function(m.fit, ParentObject, validation_data, subset_idx, predict_model_names, ...) {
models_list <- m.fit$modelfits_all
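
A hedged, simplified illustration of the scheme described in the comment block above, using lm as a stand-in model: each fold model V_i predicts only the validation rows it never saw during training, and the fold-specific predictions are combined into a single vector of length n. The toy data and fold assignment are assumptions for illustration.

set.seed(1)
n <- 20
dat <- data.frame(x = rnorm(n))
dat$y <- dat$x + rnorm(n)
folds <- sample(rep(1:4, length.out = n))   # V = 4 folds

preds <- rep(NA_real_, n)
for (v in sort(unique(folds))) {
  holdout <- folds == v
  fit_v <- stats::lm(y ~ x, data = dat[!holdout, ])
  preds[holdout] <- stats::predict(fit_v, newdata = dat[holdout, ])
}
head(preds)   # out-of-sample predictions for every row, one fold model per row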