-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhmda.wmshap.table.Rd
149 lines (128 loc) · 5.36 KB
/
hmda.wmshap.table.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hmda.wmshap.table.R
\name{hmda.wmshap.table}
\alias{hmda.wmshap.table}
\title{Create SHAP Summary Table Based on the Given Criterion}
\usage{
hmda.wmshap.table(
wmshap,
method = c("mean"),
cutoff = 0.01,
round = 3,
exclude_features = NULL,
dict = dictionary(raw, attribute = "label"),
markdown.table = TRUE,
split.tables = 120,
split.cells = 50
)
}
\arguments{
\item{wmshap}{A wmshap object, returned by the \code{hmda.wmshap} function,
containing a data frame \code{summaryShaps}.}
\item{method}{Character. Specify the method for selecting important features
based on their weighted mean SHAP ratios. The default is
\code{"mean"}, which selects features whose weighted mean SHAP ratio (WMSHAP)
meets or exceeds the \code{cutoff}. The alternative is
\code{"lowerCI"}, which selects features whose lower bound of the confidence
interval meets or exceeds the \code{cutoff}.}
\item{cutoff}{Numeric. The threshold cutoff for the selection method;
only features with a value in the \code{method} column
greater than or equal to this value are retained.
Default is \code{0.01}.}
\item{round}{Integer. The number of decimal places to which the
SHAP mean and confidence interval values are rounded. Default is
\code{3}.}
\item{exclude_features}{Character vector. A vector of feature names to be
excluded from the summary table. Default is \code{NULL}.}
\item{dict}{A data frame containing at least two columns named
\code{"name"} and \code{"description"}. If provided, the
function uses this dictionary to add human-readable feature
descriptions. Default is \code{dictionary(raw, attribute = "label")}.}
\item{markdown.table}{Logical. If \code{TRUE}, the output is formatted as a
markdown table using the \pkg{pander} package; otherwise, a
data frame is returned. Default is \code{TRUE}.}
\item{split.tables}{Integer. Controls table splitting in \code{pander()}.
Default is \code{120}.}
\item{split.cells}{Integer. Controls cell splitting in \code{pander()}.
Default is \code{50}.}
}
\value{
If \code{markdown.table = TRUE}, returns a markdown table (invisibly)
showing two columns: \code{"Description"} and \code{"WMSHAP"}. If
\code{markdown.table = FALSE}, returns a data frame with these columns.
}
\description{
Generates a summary table of weighted mean SHAP (WMSHAP) values
and confidence intervals for each feature based on a weighted SHAP analysis.
The function filters the SHAP summary table (from a \code{wmshap} object) by
selecting features that meet or exceed a specified cutoff using a selection
method (default "mean"). It then sorts the table by the mean SHAP value,
formats the SHAP values along with their 95\% confidence intervals into a single
string, and optionally adds human-readable feature descriptions from a provided
dictionary. The output is returned as a markdown table using the \pkg{pander}
package, or as a data frame if requested.
}
\examples{
\dontrun{
library(HMDA)
library(h2o)
hmda.init()

# Import a sample binary outcome dataset into H2O
train <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test <- h2o.importFile(
  "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

# Identify predictors and response
y <- "response"
x <- setdiff(names(train), y)

# For binary classification, response should be a factor
train[, y] <- as.factor(train[, y])
test[, y] <- as.factor(test[, y])

# Hyperparameter values to combine in the grid search
params <- list(
  learn_rate = c(0.01, 0.1),
  max_depth = c(3, 5, 9),
  sample_rate = c(0.8, 1.0)
)

# Train and validate a cartesian grid of GBMs.
# `hyper_params` must reference the list defined above (`params`).
hmda_grid1 <- hmda.grid(
  algorithm = "gbm",
  x = x,
  y = y,
  grid_id = "hmda_grid1",
  training_frame = train,
  nfolds = 10,
  ntrees = 100,
  seed = 1,
  hyper_params = params
)

# Assess the performances of the models
grid_performance <- hmda.grid.analysis(hmda_grid1)

# Return the best 2 models according to each metric
hmda.best.models(grid_performance, n_models = 2)

# Build an autoEnsemble model & test it with the testing dataset
meta <- hmda.autoEnsemble(models = hmda_grid1, training_frame = train)
print(h2o.performance(model = meta$model, newdata = test))

# Compute weighted mean SHAP values
wmshap <- hmda.wmshap(
  models = hmda_grid1,
  newdata = test,
  performance_metric = "aucpr",
  standardize_performance_metric = FALSE,
  performance_type = "xval",
  minimum_performance = 0,
  method = "mean",
  cutoff = 0.01,
  plot = TRUE
)

# Identify the important features
selected <- hmda.feature.selection(
  wmshap,
  method = c("mean"),
  cutoff = 0.01
)
print(selected)

# View the plot of weighted mean SHAP values and confidence intervals
print(wmshap$plot)

# Get the WMSHAP table output in Markdown format, using the function
# documented on this page
md_table <- hmda.wmshap.table(
  wmshap = wmshap,
  method = "mean",
  cutoff = 0.01,
  round = 3,
  markdown.table = TRUE
)
head(md_table)
}
}
\author{
E. F. Haghish
}