forked from gbm-developers/gbm3
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgbm.fit.R
406 lines (362 loc) · 12.9 KB
/
gbm.fit.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
#' @export gbm.fit
gbm.fit <- function(x,y,
offset = NULL,
misc = NULL,
distribution = "bernoulli",
w = NULL,
var.monotone = NULL,
n.trees = 100,
interaction.depth = 1,
n.minobsinnode = 10,
shrinkage = 0.001,
bag.fraction = 0.5,
nTrain = NULL,
train.fraction = NULL,
mFeatures = NULL,
keep.data = TRUE,
verbose = TRUE,
var.names = NULL,
response.name = "y",
group = NULL){
if(is.character(distribution)) { distribution <- list(name=distribution) }
cRows <- nrow(x)
cCols <- ncol(x)
if(nrow(x) != ifelse("Surv" %in% class(y), nrow(y), length(y))) {
stop("The number of rows in x does not equal the length of y.")
}
# the preferred way to specify the number of training instances is via parameter 'nTrain'.
# parameter 'train.fraction' is only maintained for backward compatibility.
if(!is.null(nTrain) && !is.null(train.fraction)) {
stop("Parameters 'nTrain' and 'train.fraction' cannot both be specified")
}
else if(!is.null(train.fraction)) {
warning("Parameter 'train.fraction' of gbm.fit is deprecated, please specify 'nTrain' instead")
nTrain <- floor(train.fraction*cRows)
}
else if(is.null(nTrain)) {
# both undefined, use all training data
nTrain <- cRows
}
if (is.null(train.fraction)){
train.fraction <- nTrain / cRows
}
if (is.null(mFeatures)) {
mFeatures <- cCols
}
if(is.null(var.names)) {
var.names <- getVarNames(x)
}
# if(is.null(response.name)) { response.name <- "y" }
# check dataset size
if(nTrain * bag.fraction <= 2*n.minobsinnode+1) {
stop("The dataset size is too small or subsampling rate is too large: nTrain*bag.fraction <= n.minobsinnode")
}
if (distribution$name != "pairwise") {
w <- w*length(w)/sum(w) # normalize to N
}
# Do sanity checks
ch <- checkMissing(x, y)
interaction.depth <- checkID(interaction.depth)
w <- checkWeights(w, length(y))
offset <- checkOffset(offset, y, distribution)
Misc <- NA
# setup variable types
var.type <- rep(0, cCols)
var.levels <- vector("list", cCols)
for(i in 1:length(var.type)) {
if(all(is.na(x[,i]))) {
stop("variable ",i,": ",var.names[i]," has only missing values.")
}
if(is.ordered(x[,i])) {
var.levels[[i]] <- levels(factor(x[,i]))
x[,i] <- as.numeric(factor(x[,i]))-1
var.type[i] <- 0
}
else if(is.factor(x[,i])) {
if(length(levels(x[,i]))>1024)
stop("gbm does not currently handle categorical variables with more than 1024 levels. Variable ",i,": ",var.names[i]," has ",length(levels(x[,i]))," levels.")
var.levels[[i]] <- levels(factor(x[,i]))
x[,i] <- as.numeric(factor(x[,i]))-1
var.type[i] <- max(x[,i],na.rm=TRUE)+1
}
else if(is.numeric(x[,i])) {
var.levels[[i]] <- quantile(x[,i],prob=(0:10)/10,na.rm=TRUE)
}
else{
stop("variable ",i,": ",var.names[i]," is not of type numeric, ordered, or factor.")
}
# check for some variation in each variable
if(length(unique(var.levels[[i]])) == 1) {
warning("variable ",i,": ",var.names[i]," has no variation.")
}
}
nClass <- 1
if(!("name" %in% names(distribution))) {
stop("The distribution is missing a 'name' component, for example list(name=\"gaussian\")")
}
supported.distributions <-
c("bernoulli","gaussian","poisson","adaboost","laplace","coxph","quantile",
"tdist", "multinomial", "huberized", "pairwise","gamma","tweedie")
distribution.call.name <- distribution$name
# check potential problems with the distributions
if(!is.element(distribution$name,supported.distributions))
{
stop("Distribution ",distribution$name," is not supported")
}
if((distribution$name == "bernoulli") && !all(is.element(y,0:1)))
{
stop("Bernoulli requires the response to be in {0,1}")
}
if((distribution$name == "huberized") && !all(is.element(y,0:1)))
{
stop("Huberized square hinged loss requires the response to be in {0,1}")
}
if((distribution$name == "poisson") && any(y<0))
{
stop("Poisson requires the response to be positive")
}
if((distribution$name == "gamma") && any(y<0))
{
stop("Gamma requires the response to be positive")
}
if(distribution$name == "tweedie")
{
if(any(y<0))
{
stop("Tweedie requires the response to be positive")
}
if(is.null(distribution$power))
{
distribution$power = 1.5
}
Misc <- c(power=distribution$power)
}
if((distribution$name == "poisson") && any(y != trunc(y)))
{
stop("Poisson requires the response to be a positive integer")
}
if((distribution$name == "adaboost") && !all(is.element(y,0:1)))
{
stop("This version of AdaBoost requires the response to be in {0,1}")
}
if(distribution$name == "quantile")
{
if(is.null(distribution$alpha))
{
stop("For quantile regression, the distribution parameter must be a list with a parameter 'alpha' indicating the quantile, for example list(name=\"quantile\",alpha=0.95).")
} else
if((distribution$alpha<0) || (distribution$alpha>1))
{
stop("alpha must be between 0 and 1.")
}
Misc <- c(alpha=distribution$alpha)
}
if(distribution$name == "coxph")
{
if(class(y)!="Surv")
{
stop("Outcome must be a survival object Surv(time,failure)")
}
if(attr(y,"type")!="right")
{
stop("gbm() currently only handles right censored observations")
}
Misc <- y[,2]
y <- y[,1]
# reverse sort the failure times to compute risk sets on the fly
i.train <- order(-y[1:nTrain])
n.test <- cRows - nTrain
if(n.test > 0)
{
i.test <- order(-y[(nTrain+1):cRows]) + nTrain
}
else
{
i.test <- NULL
}
i.timeorder <- c(i.train,i.test)
y <- y[i.timeorder]
Misc <- Misc[i.timeorder]
x <- x[i.timeorder,,drop=FALSE]
w <- w[i.timeorder]
if(is.null(offset)) offset <- offset[i.timeorder]
}
if(distribution$name == "tdist")
{
if (is.null(distribution$df) || !is.numeric(distribution$df)){
Misc <- 4
}
else {
Misc <- distribution$df[1]
}
}
if (distribution$name == "multinomial")
{
## Ensure that the training set contains all classes
classes <- attr(factor(y), "levels")
nClass <- length(classes)
if (nClass > nTrain){
stop(paste("Number of classes (", nClass,
") must be less than the size of the training set (", nTrain, ")",
sep = ""))
}
# f <- function(a,x){
# min((1:length(x))[x==a])
# }
new.idx <- as.vector(sapply(classes, function(a,x){ min((1:length(x))[x==a]) }, y))
all.idx <- 1:length(y)
new.idx <- c(new.idx, all.idx[!(all.idx %in% new.idx)])
y <- y[new.idx]
x <- x[new.idx, ]
w <- w[new.idx]
if (!is.null(offset)){
offset <- offset[new.idx]
}
## Get the factors
y <- as.numeric(as.vector(outer(y, classes, "==")))
## Fill out the weight and offset
w <- rep(w, nClass)
if (!is.null(offset)){
offset <- rep(offset, nClass)
}
} # close if (dist... == "multinomial"
if(distribution$name == "pairwise")
{
distribution.metric <- distribution[["metric"]]
if (!is.null(distribution.metric))
{
distribution.metric <- tolower(distribution.metric)
supported.metrics <- c("conc", "ndcg", "map", "mrr")
if (!is.element(distribution.metric, supported.metrics))
{
stop("Metric '", distribution.metric, "' is not supported, use either 'conc', 'ndcg', 'map', or 'mrr'")
}
metric <- distribution.metric
}
else
{
warning("No metric specified, using 'ndcg'")
metric <- "ndcg" # default
distribution[["metric"]] <- metric
}
if (any(y<0))
{
stop("targets for 'pairwise' should be non-negative")
}
if (is.element(metric, c("mrr", "map")) && (!all(is.element(y, 0:1))))
{
stop("Metrics 'map' and 'mrr' require the response to be in {0,1}")
}
# Cut-off rank for metrics
# Default of 0 means no cutoff
max.rank <- 0
if (!is.null(distribution[["max.rank"]]) && distribution[["max.rank"]] > 0)
{
if (is.element(metric, c("ndcg", "mrr")))
{
max.rank <- distribution[["max.rank"]]
}
else
{
stop("Parameter 'max.rank' cannot be specified for metric '", distribution.metric, "', only supported for 'ndcg' and 'mrr'")
}
}
# We pass the cut-off rank to the C function as the last element in the Misc vector
Misc <- c(group, max.rank)
distribution.call.name <- sprintf("pairwise_%s", metric)
} # close if (dist... == "pairwise"
# create index upfront... subtract one for 0 based order
x.order <- apply(x[1:nTrain,,drop=FALSE],2,order,na.last=FALSE)-1
x <- as.vector(data.matrix(x))
if(is.null(var.monotone)) var.monotone <- rep(0,cCols)
else if(length(var.monotone)!=cCols)
{
stop("Length of var.monotone != number of predictors")
}
else if(!all(is.element(var.monotone,-1:1)))
{
stop("var.monotone must be -1, 0, or 1")
}
gbm.obj <- .Call("gbm",
Y=as.double(y),
Offset=as.double(offset),
X=matrix(x, cRows, cCols),
X.order=as.integer(x.order),
weights=as.double(w),
Misc=as.double(Misc),
var.type=as.integer(var.type),
var.monotone=as.integer(var.monotone),
distribution=as.character(distribution.call.name),
n.trees=as.integer(n.trees),
interaction.depth=as.integer(interaction.depth),
n.minobsinnode=as.integer(n.minobsinnode),
n.classes = as.integer(nClass),
shrinkage=as.double(shrinkage),
bag.fraction=as.double(bag.fraction),
nTrain=as.integer(nTrain),
mFeatures=as.integer(mFeatures),
fit.old=as.double(NA),
n.cat.splits.old=as.integer(0),
n.trees.old=as.integer(0),
verbose=as.integer(verbose),
PACKAGE = "gbm")
gbm.obj$bag.fraction <- bag.fraction
gbm.obj$distribution <- distribution
gbm.obj$interaction.depth <- interaction.depth
gbm.obj$n.minobsinnode <- n.minobsinnode
gbm.obj$num.classes <- nClass
gbm.obj$n.trees <- length(gbm.obj$trees) / nClass
gbm.obj$nTrain <- nTrain
gbm.obj$mFeatures <- mFeatures
gbm.obj$train.fraction <- train.fraction
gbm.obj$response.name <- response.name
gbm.obj$shrinkage <- shrinkage
gbm.obj$var.levels <- var.levels
gbm.obj$var.monotone <- var.monotone
gbm.obj$var.names <- var.names
gbm.obj$var.type <- var.type
gbm.obj$verbose <- verbose
gbm.obj$Terms <- NULL
if(distribution$name == "coxph")
{
gbm.obj$fit[i.timeorder] <- gbm.obj$fit
}
## If K-Classification is used then split the fit and tree components
if (distribution$name == "multinomial"){
gbm.obj$fit <- matrix(gbm.obj$fit, ncol = nClass)
dimnames(gbm.obj$fit)[[2]] <- classes
gbm.obj$classes <- classes
## Also get the class estimators
exp.f <- exp(gbm.obj$fit)
denom <- matrix(rep(rowSums(exp.f), nClass), ncol = nClass)
gbm.obj$estimator <- exp.f/denom
}
if(keep.data)
{
if(distribution$name == "coxph")
{
# put the observations back in order
gbm.obj$data <- list(y=y,x=x,x.order=x.order,offset=offset,Misc=Misc,w=w,
i.timeorder=i.timeorder)
}
else if ( distribution$name == "multinomial" ){
# Restore original order of the data
new.idx <- order( new.idx )
gbm.obj$data <- list( y=as.vector(matrix(y, ncol=length(classes),byrow=FALSE)[new.idx,]),
x=as.vector(matrix(x, ncol=length(var.names), byrow=FALSE)[new.idx,]),
x.order=x.order,
offset=offset[new.idx],
Misc=Misc, w=w[new.idx] )
}
else
{
gbm.obj$data <- list(y=y,x=x,x.order=x.order,offset=offset,Misc=Misc,w=w)
}
}
else
{
gbm.obj$data <- NULL
}
class(gbm.obj) <- "gbm"
return(gbm.obj)
}