Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
SebastiaanHoppner authored May 26, 2020
1 parent 5b81315 commit 0ccbfbb
Show file tree
Hide file tree
Showing 11 changed files with 250 additions and 90 deletions.
12 changes: 1 addition & 11 deletions csboost/R/checkInputsCSboost.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
checkInputsCSboost <- function (formula, train, test,
cost_matrix_train, cost_matrix_test,
hessian_type, hessian_constant) {
cost_matrix_train, cost_matrix_test) {
# check inputs
if (missing(formula)) {
stop("argument 'formula' is missing, with no default")
Expand All @@ -17,15 +16,6 @@ checkInputsCSboost <- function (formula, train, test,
} else if (!is.matrix(cost_matrix_train) | any(dim(cost_matrix_train) != c(NROW(train), 2))) {
stop("argument 'cost_matrix_train' must be a matrix of dimension nrow(train) x 2")
}
if (missing(hessian_type)) {
stop("argument 'hessian_type' is missing, with no default")
} else if (!hessian_type %in% c("exact", "solution1", "solution2", "constant")) {
stop("argument 'hessian_type' must be either 'exact', 'solution1', 'solution2' or 'constant'")
} else if (hessian_type == "constant") {
if (is.null(hessian_constant)) {
stop("'hessian_constant' must be specified when 'hessian_type' is 'constant'")
}
}
if (!is.null(test)) {
if (!is.data.frame(test)) {
stop("argument 'test' must be a data frame")
Expand Down
38 changes: 11 additions & 27 deletions csboost/R/csboost.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
csboost <- function (formula, train, test = NULL,
cost_matrix_train, cost_matrix_test = NULL,
hessian_type, hessian_constant = NULL,
nrounds, params = list(),
verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL,
save_period = NULL, save_name = "xgboost.model",
Expand All @@ -12,13 +11,13 @@ csboost <- function (formula, train, test = NULL,
call <- match.call()

# check inputs
check <- checkInputsCSboost(formula, train, test, cost_matrix_train, cost_matrix_test,
hessian_type, hessian_constant)
check <- checkInputsCSboost(formula, train, test, cost_matrix_train, cost_matrix_test)

# convert data to xgb.DMatrix & build watchlist
labels_train <- hmeasure::relabel(model.response(check$mf_train))
dtrain <- xgboost::xgb.DMatrix(data = model.matrix(formula, train), label = labels_train)
watchlist <- list(train = dtrain)

dtest <- NULL
if (!is.null(test)) {
labels_test <- hmeasure::relabel(model.response(check$mf_test))
Expand All @@ -29,12 +28,11 @@ csboost <- function (formula, train, test = NULL,
# rearrange cost matrix & define auxilary vectors
cost_matrix_train[labels_train == 0, ] <- cost_matrix_train[labels_train == 0, c(2, 1)]
cost_matrix_train_col2 <- cost_matrix_train[, 2]
cost_no_model_train <- sum(cost_matrix_train_col2)
diff_costs_train <- cost_matrix_train[, 1] - cost_matrix_train[, 2]

if (!is.null(test)) {
cost_matrix_test[labels_test == 0, ] <- cost_matrix_test[labels_test == 0, c(2, 1)]
cost_matrix_test_col2 <- cost_matrix_test[, 2]
cost_no_model_test <- sum(cost_matrix_test_col2)
diff_costs_test <- cost_matrix_test[, 1] - cost_matrix_test[, 2]
}

Expand All @@ -46,46 +44,34 @@ csboost <- function (formula, train, test = NULL,
names(dimnames(example_cost_matrix)) <- c(" Prediction", "Reference")

# define objective function
averageExpectedCostObj <- function (scores, dtrain) {
AecGradHess <- function (scores, dtrain) {
scores <- 1 / (1 + exp(-scores))
grad <- scores * (1 - scores) * diff_costs_train
if (hessian_type == "exact") {
hess <- (1 - 2 * scores) * grad
} else if (hessian_type == "solution1") {
hess <- (1 - 2 * scores) * grad
hess[which(hess < 0)] <- 0
} else if (hessian_type == "solution2") {
hess <- abs((1 - 2 * scores) * grad)
} else if (hessian_type == "constant") {
hess <- rep(hessian_constant, length(scores))
}
hess <- abs((1 - 2 * scores) * grad)
return(list(grad = grad, hess = hess))
}

# define evaluation function
expectedSavings <- function (scores, DMatrix) {
AEC <- function (scores, DMatrix) {
scores <- 1 / (1 + exp(-scores))
if (length(scores) == NROW(dtrain)) {
cost_matrix_col2 <- cost_matrix_train_col2
cost_no_model <- cost_no_model_train
diff_costs <- diff_costs_train
} else if (length(scores) == NROW(dtest)) {
cost_matrix_col2 <- cost_matrix_test_col2
cost_no_model <- cost_no_model_test
diff_costs <- diff_costs_test
}
expected_cost <- sum(cost_matrix_col2 + scores * diff_costs)
expected_savings <- (cost_no_model - expected_cost) / cost_no_model
return(list(metric = "expected savings", value = expected_savings))
average_expected_cost <- mean(cost_matrix_col2 + scores * diff_costs)
return(list(metric = "AEC", value = average_expected_cost))
}

# fit xgboost
params$objective <- averageExpectedCostObj
params$eval_metric <- expectedSavings
params$objective <- AecGradHess
params$eval_metric <- AEC

xgbmodel <- xgboost::xgb.train(params, dtrain, nrounds, watchlist,
verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = TRUE,
early_stopping_rounds = early_stopping_rounds, maximize = FALSE,
save_period = save_period, save_name = save_name,
xgb_model = xgb_model, ...)

Expand All @@ -94,8 +80,6 @@ csboost <- function (formula, train, test = NULL,

# output
xgbmodel$params <- c(xgbmodel$params, list(formula = formula,
hessian_type = hessian_type,
hessian_constant = hessian_constant,
nrounds = nrounds,
early_stopping_rounds = early_stopping_rounds,
example_cost_matrix = example_cost_matrix))
Expand Down
27 changes: 20 additions & 7 deletions csboost/R/plot.csboost.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,32 @@
plot.csboost <- function (x, ...) {
plot.csboost <- function (x, legend_position = NULL, ...) {
# check inputs
if (missing(x)) {
stop("argument 'x' is missing, with no default")
}
if (class(x) != "csboost") {
stop("argument 'x' must be of class 'csboost'")
}
if (!is.null(legend_position)) {
if (!legend_position %in% c("bottomright", "bottom", "bottomleft", "left", "topleft",
"top", "topright", "right", "center")) {
stop(paste("'legend_position' should be one of \"bottomright\", \"bottom\", \"bottomleft\",",
"\"left\", \"topleft\", \"top\", \"topright\", \"right\", \"center\""))
}
}

# plot expected savings versus iteration
# plot average expected cost versus iteration
evallog <- x$xgbmodel$evaluation_log
plot(evallog$iter, unlist(evallog[, 2]), type = "l", lwd = 2,
ylim = range(evallog[, 2:NCOL(evallog)]), ylab = "expected savings", xlab = "iteration")
ylimit <- range(evallog[, 2:NCOL(evallog)])

plot(evallog$iter, unlist(evallog[, 2]), type = "l", ylim = ylimit,
ylab = "average expected cost", xlab = "iteration", ...)

if (NCOL(evallog) == 3) {
lines(evallog$iter, unlist(evallog[, 3]), lty = 2, lwd = 2)
legend("right", legend = c("train", "test"), lty = c(1, 2), lwd = 2)
lines(evallog$iter, unlist(evallog[, 3]), lty = 2, ...)
if (is.null(legend_position)) {
legend_position <- "top"
}
abline(v = x$xgbmodel$best_iteration, lty = 4, ...)
legend(legend_position, legend = c("train", "test"), lty = c(1, 2), lwd = 2)
}
abline(v = x$xgbmodel$best_iteration, lty = 3, lwd = 2)
}
80 changes: 55 additions & 25 deletions csboost/R/summary.csboost.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,60 @@ summary.csboost <- function (object, ...) {
# print summary
xgbmodel <- object$xgbmodel
params <- xgbmodel$params
hessian_constant <- params$hessian_constant
hessian_constant <- ifelse(is.null(hessian_constant), "NULL", hessian_constant)
best_performance <- xgbmodel$evaluation_log[xgbmodel$best_iteration, ]

cat("SETTINGS -------------------------------------------------------------------------------\n")
cat(paste(" - hessian_type =", params$hessian_type, "\n"))
cat(paste(" - hessian_constant =", hessian_constant, "\n"))
cat(paste(" - nrounds =", params$nrounds, "\n"))
cat(paste(" - early_stopping_rounds =", params$early_stopping_rounds, "\n"))
cat(paste(" - booster =", params$booster, "\n"))
cat(paste(" - etas =", params$eta, "\n"))
cat(paste(" - gamma =", params$gamma, "\n"))
cat(paste(" - max_depth =", params$max_depth, "\n"))
cat(paste(" - min_child_weight =", params$min_child_weight, "\n"))
cat(paste(" - max_delta_step =", params$max_delta_step, "\n"))
cat(paste(" - subsample =", params$subsample, "\n"))
cat(paste(" - colsample_bytree =", params$colsample_bytree, "\n"))
cat(paste(" - colsample_bylevel =", params$colsample_bylevel, "\n"))
cat(paste(" - colsample_bynode =", params$colsample_bynode, "\n"))
cat(paste(" - lambda =", params$lambda, "\n"))
cat(paste(" - alpha =", params$alpha, "\n"))
cat(paste(" - scale_pos_weight =", params$scale_pos_weight, "\n"))
cat(paste(" - base_score =", params$base_score, "\n"))
cat(paste(" - nthread =", params$nthread, "\n"))
if (!is.null(params$nrounds)) {
cat(paste(" - nrounds =", params$nrounds, "\n"))
}
if (!is.null(params$early_stopping_rounds)) {
cat(paste(" - early_stopping_rounds =", params$early_stopping_rounds, "\n"))
}
if (!is.null(params$booster)) {
cat(paste(" - booster =", params$booster, "\n"))
}
if (!is.null(params$eta)) {
cat(paste(" - etas =", params$eta, "\n"))
}
if (!is.null(params$gamma)) {
cat(paste(" - gamma =", params$gamma, "\n"))
}
if (!is.null(params$max_depth)) {
cat(paste(" - max_depth =", params$max_depth, "\n"))
}
if (!is.null(params$min_child_weight)) {
cat(paste(" - min_child_weight =", params$min_child_weight, "\n"))
}
if (!is.null(params$max_delta_step)) {
cat(paste(" - max_delta_step =", params$max_delta_step, "\n"))
}
if (!is.null(params$subsample)) {
cat(paste(" - subsample =", params$subsample, "\n"))
}
if (!is.null(params$colsample_bytree)) {
cat(paste(" - colsample_bytree =", params$colsample_bytree, "\n"))
}
if (!is.null(params$colsample_bylevel)) {
cat(paste(" - colsample_bylevel =", params$colsample_bylevel, "\n"))
}
if (!is.null(params$colsample_bynode)) {
cat(paste(" - colsample_bynode =", params$colsample_bynode, "\n"))
}
if (!is.null(params$lambda)) {
cat(paste(" - lambda =", params$lambda, "\n"))
}
if (!is.null(params$alpha)) {
cat(paste(" - alpha =", params$alpha, "\n"))
}
if (!is.null(params$scale_pos_weight)) {
cat(paste(" - scale_pos_weight =", params$scale_pos_weight, "\n"))
}
if (!is.null(params$base_score)) {
cat(paste(" - base_score =", params$base_score, "\n"))
}
if (!is.null(params$nthread)) {
cat(paste(" - nthread =", params$nthread, "\n"))
}
cat(paste(" - cost matrix (example) = \n"))
print(params$example_cost_matrix)
cat(paste(" -", xgbmodel$nfeatures, "features:\n "))
Expand All @@ -45,11 +75,11 @@ summary.csboost <- function (object, ...) {
cat(paste(" - time =", object$time, "seconds\n"))
cat(paste(" - best iteration =", xgbmodel$best_iteration, "\n"))
cat(paste(" - best ntreelimit =", xgbmodel$best_ntreelimit, "\n"))
cat(paste(" - best expected savings (train) =",
round(best_performance$`train_expected savings`, 6), "\n"))
cat(paste(" - best average expected cost (train) =",
round(best_performance$train_AEC, 6), "\n"))
if (NCOL(xgbmodel$evaluation_log) == 3) {
cat(paste(" - best expected savings (test) =",
round(best_performance$`test_expected savings`, 6), "\n"))
cat(paste(" - best average expected cost (test) =",
round(best_performance$test_AEC, 6), "\n"))
}
cat("\n")
}
Binary file added csboost/data/creditcard.RData
Binary file not shown.
17 changes: 17 additions & 0 deletions csboost/man/creditcard.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
\name{creditcard}
\docType{data}
\alias{creditcard}
\title{Creditcard Transaction Data}
\description{
Transactions made by credit cards in September 2013 by European cardholders. This data set presents a small subset of transactions that occurred in two days, where we have 465 frauds out of 1409 transactions. It contains only numerical input variables which are the result of a PCA transformation. Due to confidentiality issues, the original features and more background information about the data cannot be provided. Features V1, V2, ..., V28 are the principal components obtained with PCA. The only feature which has not been transformed with PCA isAmountwhich is the transaction amount. FeatureClassis the response variable which takes value 1 in case of fraud and 0 otherwise.
}
\usage{data(creditcard)}
\format{A data frame containing 1409 observations and 30 variables.}
\source{kaggle.com, made available by Andrea Dal Pozzolo et al., Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015.}
\keyword{datasets}
\examples{
data(creditcard)
str(creditcard)
head(creditcard)
summary(creditcard)
}
44 changes: 36 additions & 8 deletions csboost/man/csboost.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

\usage{csboost(formula, train, test = NULL,
cost_matrix_train, cost_matrix_test = NULL,
hessian_type, hessian_constant = NULL,
nrounds, params = list(),
verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL,
save_period = NULL, save_name = "xgboost.model",
Expand All @@ -25,10 +24,6 @@

\item{cost_matrix_test}{a matrix of dimension \code{nrow(test)} x 2 (if provided). For each instance, the first/second column contains the cost of correctly/incorrectly predicting the binary class of the instance (default is \code{NULL}).}

\item{hessian_type}{type of approach for calculating the second order gradient. Possible types are \code{"exact"}, \code{"solution1"}, \code{"solution2"} and \code{"constant"}.}

\item{hessian_constant}{numeric value for the constant second order gradient when \code{hessian_type = "constant"}.}

\item{nrounds}{max number of boosting iterations.}

\item{params}{the list of parameters. The complete list of parameters is available at \href{https://xgboost.readthedocs.io/en/latest/parameter.html}{https://xgboost.readthedocs.io/en/latest/parameter.html} A short summary is available in the documentation \code{help(xgb.train)}.}
Expand Down Expand Up @@ -80,13 +75,46 @@
}
}
\references{...}
\references{Hoppner, S., Baesens, B., Verbeke, W., and Verdonck, T. (2020). Instance- dependent cost-sensitive learning for detecting transfer fraud. \emph{arXiv:2005.02488}}
\author{Sebastiaan Höppner}
\author{Sebastiaan Hoppner}
\seealso{\code{\link{summary.csboost}}, \code{\link{plot.csboost}}, \code{\link{predict.csboost}}}
\examples{
# Provide an example (TODO).
library(csboost)
data(creditcard)
fixed_cost <- 50
cost_matrix <- matrix(nrow = nrow(creditcard), ncol = 2)
cost_matrix[, 1] <- ifelse(creditcard$Class == 1, fixed_cost, 0)
cost_matrix[, 2] <- ifelse(creditcard$Class == 1, creditcard$Amount, fixed_cost)
i0 <- which(creditcard$Class == 0)
i1 <- which(creditcard$Class == 1)
set.seed(2020)
i0_train <- sample(i0, size = 0.7 * length(i0))
i1_train <- sample(i1, size = 0.7 * length(i1))
train <- creditcard[ c(i0_train, i1_train), ]
test <- creditcard[-c(i0_train, i1_train), ]
cost_matrix_train <- cost_matrix[ c(i0_train, i1_train), ]
cost_matrix_test <- cost_matrix[-c(i0_train, i1_train), ]
csbtree <- csboost(formula = Class ~ . - 1,
train = train,
test = test,
cost_matrix_train = cost_matrix_train,
cost_matrix_test = cost_matrix_test,
nrounds = 300,
early_stopping_rounds = 20,
verbose = 1,
print_every_n = 1)
summary(csbtree)
plot(csbtree)
predict(csbtree, newdata = test)
}
Loading

0 comments on commit 0ccbfbb

Please sign in to comment.