% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hmda.grid.R
\name{hmda.grid}
\alias{hmda.grid}
\title{Tune Hyperparameter Grid for HMDA Framework}
\usage{
hmda.grid(
  algorithm = c("drf", "gbm"),
  grid_id = NULL,
  x,
  y,
  training_frame = h2o.getFrame("hmda.train.hex"),
  validation_frame = NULL,
  hyper_params = list(),
  nfolds = 10,
  seed = NULL,
  keep_cross_validation_predictions = TRUE,
  recovery_dir = NULL,
  sort_by = "logloss",
  ...
)
}
\arguments{
\item{algorithm}{Character. The algorithm to tune. Supported values
  are "drf" (Distributed Random Forest) and "gbm"
  (Gradient Boosting Machine). Only one algorithm may be
  specified; matching is case-insensitive.}

\item{grid_id}{Character. Optional identifier for the grid search.
  If \code{NULL}, an automatic grid_id is generated from the
  algorithm name and the current time.}

\item{x}{Vector. Predictor column names or indices.}

\item{y}{Character or integer. The response column name or index.}

\item{training_frame}{An H2OFrame containing the training data.
  Default is \code{h2o.getFrame("hmda.train.hex")}.}

\item{validation_frame}{An H2OFrame used as validation data for
  early stopping. Default is \code{NULL}.}

\item{hyper_params}{List. A list of hyperparameter vectors to tune.
  If you are unsure how to specify the hyperparameters, consider
  the \code{hmda.suggest.param} and \code{hmda.search.param}
  functions, which provide suggestions based on default values or
  random search.}

\item{nfolds}{Integer. Number of folds for cross-validation.
  Default is 10.}

\item{seed}{Integer. A seed for reproducibility.
  Default is \code{NULL}.}

\item{keep_cross_validation_predictions}{Logical. Whether to keep
  cross-validation predictions. Default is \code{TRUE}.}

\item{recovery_dir}{Character. Directory path in which to save the
  grid search output. If provided, the grid is saved using
  \code{h2o.saveGrid()}.}

\item{sort_by}{Character. Metric used to sort the grid. Default is
  "logloss".}

\item{...}{Additional arguments passed to \code{h2o.grid()}.}
}
\value{
An object of class \code{H2OGrid} containing the grid search
results.
}
\description{
Generates a hyperparameter grid for a single tree-based
algorithm (either "drf" or "gbm") by running a grid search.
The function validates inputs, generates an
automatic grid ID for the grid (if not provided), and optionally
saves the grid to a recovery directory. The resulting grid object
contains all trained models and can be used for further analysis.
For scientific computing, saving the grid is highly recommended
so that the training need not be re-run in the future.
}
\details{
The function executes the following steps:
\enumerate{
  \item \strong{Input Validation:} Ensures only one algorithm is
    specified and verifies that the training frame is an H2OFrame.
  \item \strong{Grid ID Generation:} If no \code{grid_id} is provided,
    one is created from the algorithm name and the current time.
  \item \strong{Grid Search Execution:} Calls \code{h2o.grid()} with
    the provided hyperparameters and cross-validation settings.
  \item \strong{Grid Saving:} If a recovery directory is specified,
    the grid is saved to disk using \code{h2o.saveGrid()}.
}
The output is an H2O grid object that contains all the trained models.
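As a minimal sketch of working with the result (assuming
\code{grid} holds the object returned by \code{hmda.grid()};
\code{h2o.getModel()} and \code{h2o.performance()} are standard
h2o accessors):

\preformatted{
# every model trained during the search is stored in the grid
length(grid@model_ids)                   # number of fitted models
m1 <- h2o.getModel(grid@model_ids[[1]])  # retrieve a single model
h2o.performance(m1, xval = TRUE)         # cross-validated performance
}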
}
\examples{
\dontrun{
library(HMDA)
library(h2o)
hmda.init()
# Import a sample binary outcome dataset into H2O
train <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
# Identify predictors and response
y <- "response"
x <- setdiff(names(train), y)
# For binary classification, response should be a factor
train[, y] <- as.factor(train[, y])
test[, y] <- as.factor(test[, y])
params <- list(learn_rate = c(0.01, 0.1),
               max_depth = c(3, 5, 9),
               sample_rate = c(0.8, 1.0))
# Train and validate a cartesian grid of GBMs
hmda_grid1 <- hmda.grid(algorithm = "gbm", x = x, y = y,
                        grid_id = "hmda_grid1",
                        training_frame = train,
                        nfolds = 10,
                        ntrees = 100,
                        seed = 1,
                        hyper_params = params)
# Assess the performances of the models
grid_performance <- hmda.grid.analysis(hmda_grid1)
# Return the best 2 models according to each metric
hmda.best.models(grid_performance, n_models = 2)
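# Optionally, persist the grid so it can be restored later without
# re-training. The call below is a sketch: tempdir() is a stand-in
# for a real recovery directory of your choice.
hmda_grid2 <- hmda.grid(algorithm = "gbm", x = x, y = y,
                        grid_id = "hmda_grid2",
                        training_frame = train,
                        nfolds = 10,
                        ntrees = 100,
                        seed = 1,
                        hyper_params = params,
                        recovery_dir = tempdir())
# Reload in a later session with h2o's standard loader (assuming the
# grid is saved as recovery_dir/grid_id):
# hmda_grid2 <- h2o.loadGrid(file.path(tempdir(), "hmda_grid2"))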
}
}
\author{
E. F. Haghish
}