% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hmda.grid.R
\name{hmda.grid}
\alias{hmda.grid}
\title{Tune Hyperparameter Grid for HMDA Framework}
\usage{
hmda.grid(
  algorithm = c("drf", "gbm"),
  grid_id = NULL,
  x,
  y,
  training_frame = h2o.getFrame("hmda.train.hex"),
  validation_frame = NULL,
  hyper_params = list(),
  nfolds = 10,
  seed = NULL,
  keep_cross_validation_predictions = TRUE,
  recovery_dir = NULL,
  sort_by = "logloss",
  ...
)
}
\arguments{
\item{algorithm}{Character. The algorithm to tune. Supported values
  are "drf" (Distributed Random Forest) and "gbm"
  (Gradient Boosting Machine). Only one algorithm may be
  specified; matching is case-insensitive.}

\item{grid_id}{Character. Optional identifier for the grid search.
  If \code{NULL}, an automatic grid_id is generated from the
  algorithm name and the current time.}

\item{x}{Vector. Predictor column names or indices.}

\item{y}{Character or integer. The response column name or index.}

\item{training_frame}{An H2OFrame containing the training data.
  Default is \code{h2o.getFrame("hmda.train.hex")}.}

\item{validation_frame}{An H2OFrame used as validation data for
  early stopping. Default is \code{NULL}.}

\item{hyper_params}{List. A list of hyperparameter vectors to tune.
  If you are unsure how to specify the hyperparameters, consider
  the \code{hmda.suggest.param} and \code{hmda.search.param}
  functions, which provide suggestions based on default values or
  random search.}

\item{nfolds}{Integer. Number of folds for cross-validation.
  Default is 10.}

\item{seed}{Integer. A seed for reproducibility.
  Default is \code{NULL}.}

\item{keep_cross_validation_predictions}{Logical. Whether to keep
  cross-validation predictions. Default is \code{TRUE}.}

\item{recovery_dir}{Character. Directory path in which to save the
  grid search output. If provided, the grid is saved using
  \code{h2o.saveGrid()}.}

\item{sort_by}{Character. Metric used to sort the grid. Default is
  "logloss".}

\item{...}{Additional arguments passed to \code{h2o.grid()}.}
}
\value{
An object of class \code{H2OGrid} containing the grid search
results.
}
\description{
Generates a hyperparameter grid for a single tree-based
algorithm (either "drf" or "gbm") by running a grid search.
The function validates inputs, generates an
automatic grid ID for the grid (if not provided), and optionally
saves the grid to a recovery directory. The resulting grid object
contains all trained models and can be used for further analysis.
For scientific computing, saving the grid is highly recommended
so that the training need not be re-run in the future.
}
\details{
The function executes the following steps:
\enumerate{
  \item \strong{Input Validation:} Ensures only one algorithm is
    specified and verifies that the training frame is an H2OFrame.
  \item \strong{Grid ID Generation:} If no \code{grid_id} is provided,
    one is created from the algorithm name and the current time.
  \item \strong{Grid Search Execution:} Calls \code{h2o.grid()} with
    the provided hyperparameters and cross-validation settings.
  \item \strong{Grid Saving:} If a recovery directory is specified,
    the grid is saved to disk using \code{h2o.saveGrid()}.
}
The output is an H2O grid object that contains all the trained models.
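As a minimal sketch of working with the result (assuming
\code{grid} holds the object returned by \code{hmda.grid()};
\code{h2o.getModel()} and \code{h2o.performance()} are standard
h2o accessors):

\preformatted{
# every model trained during the search is stored in the grid
length(grid@model_ids)                   # number of fitted models
m1 <- h2o.getModel(grid@model_ids[[1]])  # retrieve a single model
h2o.performance(m1, xval = TRUE)         # cross-validated performance
}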
}
\examples{
\dontrun{
library(HMDA)
library(h2o)
hmda.init()
# Import a sample binary outcome dataset into H2O
train <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
# Identify predictors and response
y <- "response"
x <- setdiff(names(train), y)
# For binary classification, response should be a factor
train[, y] <- as.factor(train[, y])
test[, y] <- as.factor(test[, y])
params <- list(learn_rate = c(0.01, 0.1),
               max_depth = c(3, 5, 9),
               sample_rate = c(0.8, 1.0))
# Train and validate a cartesian grid of GBMs
hmda_grid1 <- hmda.grid(algorithm = "gbm", x = x, y = y,
                        grid_id = "hmda_grid1",
                        training_frame = train,
                        nfolds = 10,
                        ntrees = 100,
                        seed = 1,
                        hyper_params = params)
# Assess the performances of the models
grid_performance <- hmda.grid.analysis(hmda_grid1)
# Return the best 2 models according to each metric
hmda.best.models(grid_performance, n_models = 2)
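# Optionally, persist the grid so it can be restored later without
# re-training. The call below is a sketch: tempdir() is a stand-in
# for a real recovery directory of your choice.
hmda_grid2 <- hmda.grid(algorithm = "gbm", x = x, y = y,
                        grid_id = "hmda_grid2",
                        training_frame = train,
                        nfolds = 10,
                        ntrees = 100,
                        seed = 1,
                        hyper_params = params,
                        recovery_dir = tempdir())
# Reload in a later session with h2o's standard loader (assuming the
# grid is saved as recovery_dir/grid_id):
# hmda_grid2 <- h2o.loadGrid(file.path(tempdir(), "hmda_grid2"))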
}
}
\author{
E. F. Haghish
}