% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hmda.autoEnsemble.R
\name{hmda.autoEnsemble}
\alias{hmda.autoEnsemble}
\title{Build a Stacked Ensemble Model Using the autoEnsemble R Package}
\usage{
hmda.autoEnsemble(
  models,
  training_frame,
  newdata = NULL,
  family = "binary",
  strategy = c("search"),
  model_selection_criteria = c("auc", "aucpr", "mcc", "f2"),
  min_improvement = 1e-05,
  max = NULL,
  top_rank = seq(0.01, 0.99, 0.01),
  stop_rounds = 3,
  reset_stop_rounds = TRUE,
  stop_metric = "auc",
  seed = -1,
  verbatim = FALSE
)
}
\arguments{
\item{models}{A grid object, such as an HMDA grid, or a character vector
of H2O model IDs. The \code{h2o.get_ids} function from \pkg{h2otools}
can be used to extract model IDs from grids (see the examples below).}
\item{training_frame}{An H2OFrame (or data frame already uploaded to the H2O server)
that contains the training data used to build the base models.}
\item{newdata}{An H2OFrame (or data frame already uploaded to the H2O server) to be used
for evaluating the ensemble. If not specified, performance on the training data is used
(for instance, cross-validation performance).}
\item{family}{A character string specifying the model family. The default
is \code{"binary"}.}
\item{strategy}{A character vector specifying the ensemble strategy. The available
strategy is \code{"search"} (default). The \code{"search"} strategy searches for
the best combination of top-performing diverse models.}
\item{model_selection_criteria}{A character vector specifying the performance metrics
to consider for model selection. The default is \code{c("auc", "aucpr", "mcc", "f2")}.
Other possible criteria include \code{"f1point5"}, \code{"f3"}, \code{"f4"},
\code{"f5"}, \code{"kappa"}, \code{"mean_per_class_error"}, \code{"gini"}, and
\code{"accuracy"}.}
\item{min_improvement}{Numeric. The minimum improvement in the evaluation metric
required to continue the ensemble search.}
\item{max}{Integer. The maximum number of models for each selection criterion.
If \code{NULL}, a default value based on the top rank percentage is used.}
\item{top_rank}{Numeric vector. The percentage (or percentages) of
top-ranked models to consider for ensemble selection. With the
\code{"search"} strategy, the function searches for the best combination
of models from the top-ranked downward; with the \code{"top"} strategy,
only the first value is used. Default is \code{seq(0.01, 0.99, 0.01)}.}
\item{stop_rounds}{Integer. The number of consecutive rounds with no improvement
in the performance metric before stopping the search.}
\item{reset_stop_rounds}{Logical. If \code{TRUE}, the stopping rounds counter is
reset each time an improvement is observed.}
\item{stop_metric}{Character. The metric used for early stopping; the default is
\code{"auc"}. Other options include \code{"aucpr"} and \code{"mcc"}.}
\item{seed}{Integer. A random seed for reproducibility. Default is \code{-1}.}
\item{verbatim}{Logical. If \code{TRUE}, the function prints additional
progress information for debugging purposes.}
}
\value{
A list containing:
\describe{
\item{model}{The ensemble model built by autoEnsemble.}
\item{top_models}{A data frame of the top-ranked base models that were used
in building the ensemble.}
}
}
\description{
This function is a wrapper within the HMDA package that
builds a stacked ensemble model by combining multiple H2O models. It
leverages the \pkg{autoEnsemble} package to stack a set of trained models
(e.g., from an HMDA grid) into a stronger meta-learner. For more
details on autoEnsemble, see the GitHub repository at
\url{https://github.com/haghish/autoEnsemble} and the CRAN page at
\url{https://CRAN.R-project.org/package=autoEnsemble}.
}
\details{
This wrapper function integrates with the HMDA package workflow to build a
stacked ensemble model from a set of base H2O models. It calls the
\code{ensemble()} function from the \pkg{autoEnsemble} package to construct the
ensemble. The function is designed to work within HMDA's framework, where base
models are generated via grid search or AutoML. For more details on the autoEnsemble
approach, see:
\itemize{
\item GitHub: \url{https://github.com/haghish/autoEnsemble}
\item CRAN: \url{https://CRAN.R-project.org/package=autoEnsemble}
}
The ensemble strategy \code{"search"} (default) searches for the best combination
of top-performing and diverse models to improve overall performance. The wrapper
returns both the final ensemble model and the list of top-ranked models used in the
ensemble.
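
As a rough, illustrative sketch (\code{grid} and \code{train} are
placeholder names, and the call is a conceptual approximation rather than
the exact internal implementation), the wrapper amounts to extracting the
base model IDs and passing them to \code{autoEnsemble::ensemble()}:
\preformatted{
  ids <- h2otools::h2o.get_ids(grid)    # IDs of the base models
  ens <- autoEnsemble::ensemble(models = ids,
                                training_frame = train,
                                strategy = "search")
}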
}
\examples{
\dontrun{
library(HMDA)
library(h2o)
hmda.init()
# Import a sample binary outcome dataset into H2O
train <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
# Identify predictors and response
y <- "response"
x <- setdiff(names(train), y)
# For binary classification, response should be a factor
train[, y] <- as.factor(train[, y])
test[, y] <- as.factor(test[, y])
params <- list(learn_rate = c(0.01, 0.1),
               max_depth = c(3, 5, 9),
               sample_rate = c(0.8, 1.0))
# Train and validate a cartesian grid of GBMs
hmda_grid1 <- hmda.grid(algorithm = "gbm", x = x, y = y,
                        grid_id = "hmda_grid1",
                        training_frame = train,
                        nfolds = 10,
                        ntrees = 100,
                        seed = 1,
                        hyper_params = params)
# Assess the performances of the models
grid_performance <- hmda.grid.analysis(hmda_grid1)
# Return the best 2 models according to each metric
hmda.best.models(grid_performance, n_models = 2)
# Build an autoEnsemble model and evaluate it with the testing dataset
meta <- hmda.autoEnsemble(models = hmda_grid1, training_frame = train)
print(h2o.performance(model = meta$model, newdata = test))
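# Alternatively (illustrative), base models can be passed as a character
# vector of model IDs extracted with h2o.get_ids() from the 'h2otools'
# package; 'newdata' evaluates the ensemble on the test set
ids <- h2otools::h2o.get_ids(hmda_grid1)
meta2 <- hmda.autoEnsemble(models = ids,
                           training_frame = train,
                           newdata = test)
# Inspect the top-ranked base models used in the ensemble
print(meta2$top_models)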
}
}
\author{
E. F. Haghish
}