% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hmda.autoEnsemble.R
\name{hmda.autoEnsemble}
\alias{hmda.autoEnsemble}
\title{Build a Stacked Ensemble Model Using the autoEnsemble R Package}
\usage{
hmda.autoEnsemble(
  models,
  training_frame,
  newdata = NULL,
  family = "binary",
  strategy = c("search"),
  model_selection_criteria = c("auc", "aucpr", "mcc", "f2"),
  min_improvement = 1e-05,
  max = NULL,
  top_rank = seq(0.01, 0.99, 0.01),
  stop_rounds = 3,
  reset_stop_rounds = TRUE,
  stop_metric = "auc",
  seed = -1,
  verbatim = FALSE
)
}
\arguments{
\item{models}{A grid object, such as an HMDA grid, or a character vector
of H2O model IDs. The \code{h2o.get_ids} function from \pkg{h2otools}
can be used to extract model IDs from grids (see the examples below).}
\item{training_frame}{An H2OFrame (or data frame already uploaded to the H2O server)
that contains the training data used to build the base models.}
\item{newdata}{An H2OFrame (or data frame already uploaded to the H2O server) to be used
for evaluating the ensemble. If not specified, performance on the training data is used
(for instance, cross-validation performance).}
\item{family}{A character string specifying the model family. The default
is \code{"binary"}.}
\item{strategy}{A character vector specifying the ensemble strategy. The available
strategy is \code{"search"} (default). The \code{"search"} strategy searches for
the best combination of top-performing diverse models.}
\item{model_selection_criteria}{A character vector specifying the performance metrics
to consider for model selection. The default is \code{c("auc", "aucpr", "mcc", "f2")}.
Other possible criteria include \code{"f1point5"}, \code{"f3"}, \code{"f4"},
\code{"f5"}, \code{"kappa"}, \code{"mean_per_class_error"}, \code{"gini"}, and
\code{"accuracy"}.}
\item{min_improvement}{Numeric. The minimum improvement in the evaluation metric
required to continue the ensemble search.}
\item{max}{Integer. The maximum number of models for each selection criterion.
If \code{NULL}, a default value based on the top rank percentage is used.}
\item{top_rank}{Numeric vector. The percentage (or percentages) of
top-ranked models to consider for ensemble selection. With the
\code{"search"} strategy, the function searches for the best combination
of models from the top-ranked downward; with the \code{"top"} strategy,
only the first value is used. Default is \code{seq(0.01, 0.99, 0.01)}.}
\item{stop_rounds}{Integer. The number of consecutive rounds with no improvement
in the performance metric before stopping the search.}
\item{reset_stop_rounds}{Logical. If \code{TRUE}, the stopping rounds counter is
reset each time an improvement is observed.}
\item{stop_metric}{Character. The metric used for early stopping; the default is
\code{"auc"}. Other options include \code{"aucpr"} and \code{"mcc"}.}
\item{seed}{Integer. A random seed for reproducibility. Default is \code{-1}.}
\item{verbatim}{Logical. If \code{TRUE}, the function prints additional
progress information for debugging purposes.}
}
\value{
A list containing:
\describe{
\item{model}{The ensemble model built by autoEnsemble.}
\item{top_models}{A data frame of the top-ranked base models that were used
in building the ensemble.}
}
}
\description{
This function is a wrapper within the HMDA package that
builds a stacked ensemble model by combining multiple H2O models. It
leverages the \pkg{autoEnsemble} package to stack a set of trained models
(e.g., from an HMDA grid) into a stronger meta-learner. For more
details on autoEnsemble, see the GitHub repository at
\url{https://github.com/haghish/autoEnsemble} and the CRAN page at
\url{https://CRAN.R-project.org/package=autoEnsemble}.
}
\details{
This wrapper function integrates with the HMDA package workflow to build a
stacked ensemble model from a set of base H2O models. It calls the
\code{ensemble()} function from the \pkg{autoEnsemble} package to construct the
ensemble. The function is designed to work within HMDA's framework, where base
models are generated via grid search or AutoML. For more details on the autoEnsemble
approach, see:
\itemize{
\item GitHub: \url{https://github.com/haghish/autoEnsemble}
\item CRAN: \url{https://CRAN.R-project.org/package=autoEnsemble}
}
The ensemble strategy \code{"search"} (default) searches for the best combination
of top-performing and diverse models to improve overall performance. The wrapper
returns both the final ensemble model and the list of top-ranked models used in the
ensemble.
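
As a rough, illustrative sketch (\code{grid} and \code{train} are
placeholder names, and the call is a conceptual approximation rather than
the exact internal implementation), the wrapper amounts to extracting the
base model IDs and passing them to \code{autoEnsemble::ensemble()}:
\preformatted{
  ids <- h2otools::h2o.get_ids(grid)    # IDs of the base models
  ens <- autoEnsemble::ensemble(models = ids,
                                training_frame = train,
                                strategy = "search")
}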
}
\examples{
\dontrun{
library(HMDA)
library(h2o)
hmda.init()
# Import a sample binary outcome dataset into H2O
train <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
# Identify predictors and response
y <- "response"
x <- setdiff(names(train), y)
# For binary classification, response should be a factor
train[, y] <- as.factor(train[, y])
test[, y] <- as.factor(test[, y])
params <- list(learn_rate = c(0.01, 0.1),
               max_depth = c(3, 5, 9),
               sample_rate = c(0.8, 1.0))
# Train and validate a cartesian grid of GBMs
hmda_grid1 <- hmda.grid(algorithm = "gbm", x = x, y = y,
                        grid_id = "hmda_grid1",
                        training_frame = train,
                        nfolds = 10,
                        ntrees = 100,
                        seed = 1,
                        hyper_params = params)
# Assess the performances of the models
grid_performance <- hmda.grid.analysis(hmda_grid1)
# Return the best 2 models according to each metric
hmda.best.models(grid_performance, n_models = 2)
# Build an autoEnsemble model and evaluate it with the testing dataset
meta <- hmda.autoEnsemble(models = hmda_grid1, training_frame = train)
print(h2o.performance(model = meta$model, newdata = test))
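# Alternatively (illustrative), base models can be passed as a character
# vector of model IDs extracted with h2o.get_ids() from the 'h2otools'
# package; 'newdata' evaluates the ensemble on the test set
ids <- h2otools::h2o.get_ids(hmda_grid1)
meta2 <- hmda.autoEnsemble(models = ids,
                           training_frame = train,
                           newdata = test)
# Inspect the top-ranked base models used in the ensemble
print(meta2$top_models)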
}
}
\author{
E. F. Haghish
}