Add files via upload

SebastiaanHoppner · May 26, 2020 · 0ccbfbb · 0ccbfbb
1 parent 5b81315
commit 0ccbfbb
Show file tree

Hide file tree

Showing 11 changed files with 250 additions and 90 deletions.
diff --git a/csboost/R/checkInputsCSboost.R b/csboost/R/checkInputsCSboost.R
@@ -1,6 +1,5 @@
 checkInputsCSboost <- function (formula, train, test,
-                                cost_matrix_train, cost_matrix_test,
-                                hessian_type, hessian_constant) {
+                                cost_matrix_train, cost_matrix_test) {
   # check inputs
   if (missing(formula)) {
     stop("argument 'formula' is missing, with no default")
@@ -17,15 +16,6 @@ checkInputsCSboost <- function (formula, train, test,
   } else if (!is.matrix(cost_matrix_train) | any(dim(cost_matrix_train) != c(NROW(train), 2))) {
     stop("argument 'cost_matrix_train' must be a matrix of dimension nrow(train) x 2")
   }
-  if (missing(hessian_type)) {
-    stop("argument 'hessian_type' is missing, with no default")
-  } else if (!hessian_type %in% c("exact", "solution1", "solution2", "constant")) {
-    stop("argument 'hessian_type' must be either 'exact', 'solution1', 'solution2' or 'constant'")
-  } else if (hessian_type == "constant") {
-    if (is.null(hessian_constant)) {
-      stop("'hessian_constant' must be specified when 'hessian_type' is 'constant'")
-    }
-  }
   if (!is.null(test)) {
     if (!is.data.frame(test)) {
       stop("argument 'test' must be a data frame")

diff --git a/csboost/R/csboost.R b/csboost/R/csboost.R
@@ -1,6 +1,5 @@
 csboost <- function (formula, train, test = NULL,
                      cost_matrix_train, cost_matrix_test = NULL,
-                     hessian_type, hessian_constant = NULL,
                      nrounds, params = list(),
                      verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL,
                      save_period = NULL, save_name = "xgboost.model",
@@ -12,13 +11,13 @@ csboost <- function (formula, train, test = NULL,
   call <- match.call()
 
   # check inputs
-  check <- checkInputsCSboost(formula, train, test, cost_matrix_train, cost_matrix_test,
-                              hessian_type, hessian_constant)
+  check <- checkInputsCSboost(formula, train, test, cost_matrix_train, cost_matrix_test)
 
   # convert data to xgb.DMatrix & build watchlist
   labels_train <- hmeasure::relabel(model.response(check$mf_train))
   dtrain <- xgboost::xgb.DMatrix(data = model.matrix(formula, train), label = labels_train)
   watchlist <- list(train = dtrain)
+
   dtest <- NULL
   if (!is.null(test)) {
     labels_test <- hmeasure::relabel(model.response(check$mf_test))
@@ -29,12 +28,11 @@ csboost <- function (formula, train, test = NULL,
   # rearrange cost matrix & define auxilary vectors
   cost_matrix_train[labels_train == 0, ] <- cost_matrix_train[labels_train == 0, c(2, 1)]
   cost_matrix_train_col2 <- cost_matrix_train[, 2]
-  cost_no_model_train <- sum(cost_matrix_train_col2)
   diff_costs_train <- cost_matrix_train[, 1] - cost_matrix_train[, 2]
+
   if (!is.null(test)) {
     cost_matrix_test[labels_test == 0, ] <- cost_matrix_test[labels_test == 0, c(2, 1)]
     cost_matrix_test_col2 <- cost_matrix_test[, 2]
-    cost_no_model_test <- sum(cost_matrix_test_col2)
     diff_costs_test <- cost_matrix_test[, 1] - cost_matrix_test[, 2]
   }
 
@@ -46,46 +44,34 @@ csboost <- function (formula, train, test = NULL,
   names(dimnames(example_cost_matrix)) <- c("      Prediction", "Reference")
 
   # define objective function
-  averageExpectedCostObj <- function (scores, dtrain) {
+  AecGradHess <- function (scores, dtrain) {
     scores <- 1 / (1 + exp(-scores))
     grad <- scores * (1 - scores) * diff_costs_train
-    if (hessian_type == "exact") {
-      hess <- (1 - 2 * scores) * grad
-    } else if (hessian_type == "solution1") {
-      hess <- (1 - 2 * scores) * grad
-      hess[which(hess < 0)] <- 0
-    } else if (hessian_type == "solution2") {
-      hess <- abs((1 - 2 * scores) * grad)
-    } else if (hessian_type == "constant") {
-      hess <- rep(hessian_constant, length(scores))
-    }
+    hess <- abs((1 - 2 * scores) * grad)
     return(list(grad = grad, hess = hess))
   }
 
   # define evaluation function
-  expectedSavings <- function (scores, DMatrix) {
+  AEC <- function (scores, DMatrix) {
     scores <- 1 / (1 + exp(-scores))
     if (length(scores) == NROW(dtrain)) {
       cost_matrix_col2 <- cost_matrix_train_col2
-      cost_no_model <- cost_no_model_train
       diff_costs <- diff_costs_train
     } else if (length(scores) == NROW(dtest)) {
       cost_matrix_col2 <- cost_matrix_test_col2
-      cost_no_model <- cost_no_model_test
       diff_costs <- diff_costs_test
     }
-    expected_cost <- sum(cost_matrix_col2 + scores * diff_costs)
-    expected_savings <- (cost_no_model - expected_cost) / cost_no_model
-    return(list(metric = "expected savings", value = expected_savings))
+    average_expected_cost <- mean(cost_matrix_col2 + scores * diff_costs)
+    return(list(metric = "AEC", value = average_expected_cost))
   }
 
   # fit xgboost
-  params$objective <- averageExpectedCostObj
-  params$eval_metric <- expectedSavings
+  params$objective <- AecGradHess
+  params$eval_metric <- AEC
 
   xgbmodel <- xgboost::xgb.train(params, dtrain, nrounds, watchlist,
                                  verbose = verbose, print_every_n = print_every_n,
-                                 early_stopping_rounds = early_stopping_rounds, maximize = TRUE,
+                                 early_stopping_rounds = early_stopping_rounds, maximize = FALSE,
                                  save_period = save_period, save_name = save_name,
                                  xgb_model = xgb_model, ...)
 
@@ -94,8 +80,6 @@ csboost <- function (formula, train, test = NULL,
 
   # output
   xgbmodel$params <- c(xgbmodel$params, list(formula               = formula,
-                                             hessian_type          = hessian_type,
-                                             hessian_constant      = hessian_constant,
                                              nrounds               = nrounds,
                                              early_stopping_rounds = early_stopping_rounds,
                                              example_cost_matrix   = example_cost_matrix))

diff --git a/csboost/R/plot.csboost.R b/csboost/R/plot.csboost.R
@@ -1,19 +1,32 @@
-plot.csboost <- function (x, ...) {
+plot.csboost <- function (x, legend_position = NULL, ...) {
   # check inputs
   if (missing(x)) {
     stop("argument 'x' is missing, with no default")
   }
   if (class(x) != "csboost") {
     stop("argument 'x' must be of class 'csboost'")
   }
+  if (!is.null(legend_position)) {
+    if (!legend_position %in% c("bottomright", "bottom", "bottomleft", "left", "topleft",
+                                "top", "topright", "right", "center")) {
+      stop(paste("'legend_position' should be one of \"bottomright\", \"bottom\", \"bottomleft\",",
+                 "\"left\", \"topleft\", \"top\", \"topright\", \"right\", \"center\""))
+    }
+  }
 
-  # plot expected savings versus iteration
+  # plot average expected cost versus iteration
   evallog <- x$xgbmodel$evaluation_log
-  plot(evallog$iter, unlist(evallog[, 2]), type = "l", lwd = 2,
-       ylim = range(evallog[, 2:NCOL(evallog)]), ylab = "expected savings", xlab = "iteration")
+  ylimit <- range(evallog[, 2:NCOL(evallog)])
+
+  plot(evallog$iter, unlist(evallog[, 2]), type = "l", ylim = ylimit,
+       ylab = "average expected cost", xlab = "iteration", ...)
+
   if (NCOL(evallog) == 3) {
-    lines(evallog$iter, unlist(evallog[, 3]), lty = 2, lwd = 2)
-    legend("right", legend = c("train", "test"), lty = c(1, 2), lwd = 2)
+    lines(evallog$iter, unlist(evallog[, 3]), lty = 2, ...)
+    if (is.null(legend_position)) {
+      legend_position <- "top"
+    }
+    abline(v = x$xgbmodel$best_iteration, lty = 4, ...)
+    legend(legend_position, legend = c("train", "test"), lty = c(1, 2), lwd = 2)
   }
-  abline(v = x$xgbmodel$best_iteration, lty = 3, lwd = 2)
 }
diff --git a/csboost/R/summary.csboost.R b/csboost/R/summary.csboost.R
@@ -10,30 +10,60 @@ summary.csboost <- function (object, ...) {
   # print summary
   xgbmodel <- object$xgbmodel
   params <- xgbmodel$params
-  hessian_constant <- params$hessian_constant
-  hessian_constant <- ifelse(is.null(hessian_constant), "NULL", hessian_constant)
   best_performance <- xgbmodel$evaluation_log[xgbmodel$best_iteration, ]
 
   cat("SETTINGS -------------------------------------------------------------------------------\n")
-  cat(paste("  - hessian_type =",          params$hessian_type,          "\n"))
-  cat(paste("  - hessian_constant =",      hessian_constant,             "\n"))
-  cat(paste("  - nrounds =",               params$nrounds,               "\n"))
-  cat(paste("  - early_stopping_rounds =", params$early_stopping_rounds, "\n"))
-  cat(paste("  - booster =",               params$booster,               "\n"))
-  cat(paste("  - etas =",                  params$eta,                   "\n"))
-  cat(paste("  - gamma =",                 params$gamma,                 "\n"))
-  cat(paste("  - max_depth =",             params$max_depth,             "\n"))
-  cat(paste("  - min_child_weight =",      params$min_child_weight,      "\n"))
-  cat(paste("  - max_delta_step =",        params$max_delta_step,        "\n"))
-  cat(paste("  - subsample =",             params$subsample,             "\n"))
-  cat(paste("  - colsample_bytree =",      params$colsample_bytree,      "\n"))
-  cat(paste("  - colsample_bylevel =",     params$colsample_bylevel,     "\n"))
-  cat(paste("  - colsample_bynode =",      params$colsample_bynode,      "\n"))
-  cat(paste("  - lambda =",                params$lambda,                "\n"))
-  cat(paste("  - alpha =",                 params$alpha,                 "\n"))
-  cat(paste("  - scale_pos_weight =",      params$scale_pos_weight,      "\n"))
-  cat(paste("  - base_score =",            params$base_score,            "\n"))
-  cat(paste("  - nthread =",               params$nthread,               "\n"))
+  if (!is.null(params$nrounds)) {
+    cat(paste("  - nrounds =",               params$nrounds,               "\n"))
+  }
+  if (!is.null(params$early_stopping_rounds)) {
+    cat(paste("  - early_stopping_rounds =", params$early_stopping_rounds, "\n"))
+  }
+  if (!is.null(params$booster)) {
+    cat(paste("  - booster =",               params$booster,               "\n"))
+  }
+  if (!is.null(params$eta)) {
+    cat(paste("  - etas =",                  params$eta,                   "\n"))
+  }
+  if (!is.null(params$gamma)) {
+    cat(paste("  - gamma =",                 params$gamma,                 "\n"))
+  }
+  if (!is.null(params$max_depth)) {
+    cat(paste("  - max_depth =",             params$max_depth,             "\n"))
+  }
+  if (!is.null(params$min_child_weight)) {
+    cat(paste("  - min_child_weight =",      params$min_child_weight,      "\n"))
+  }
+  if (!is.null(params$max_delta_step)) {
+    cat(paste("  - max_delta_step =",        params$max_delta_step,        "\n"))
+  }
+  if (!is.null(params$subsample)) {
+    cat(paste("  - subsample =",             params$subsample,             "\n"))
+  }
+  if (!is.null(params$colsample_bytree)) {
+    cat(paste("  - colsample_bytree =",      params$colsample_bytree,      "\n"))
+  }
+  if (!is.null(params$colsample_bylevel)) {
+    cat(paste("  - colsample_bylevel =",     params$colsample_bylevel,     "\n"))
+  }
+  if (!is.null(params$colsample_bynode)) {
+    cat(paste("  - colsample_bynode =",      params$colsample_bynode,      "\n"))
+  }
+  if (!is.null(params$lambda)) {
+    cat(paste("  - lambda =",                params$lambda,                "\n"))
+  }
+  if (!is.null(params$alpha)) {
+    cat(paste("  - alpha =",                 params$alpha,                 "\n"))
+  }
+  if (!is.null(params$scale_pos_weight)) {
+    cat(paste("  - scale_pos_weight =",      params$scale_pos_weight,      "\n"))
+  }
+  if (!is.null(params$base_score)) {
+    cat(paste("  - base_score =",            params$base_score,            "\n"))
+  }
+  if (!is.null(params$nthread)) {
+    cat(paste("  - nthread =",               params$nthread,               "\n"))
+  }
   cat(paste("  - cost matrix (example) = \n"))
   print(params$example_cost_matrix)
   cat(paste("  -", xgbmodel$nfeatures, "features:\n      "))
@@ -45,11 +75,11 @@ summary.csboost <- function (object, ...) {
   cat(paste("  - time =",            object$time, "seconds\n"))
   cat(paste("  - best iteration =",  xgbmodel$best_iteration, "\n"))
   cat(paste("  - best ntreelimit =", xgbmodel$best_ntreelimit, "\n"))
-  cat(paste("  - best expected savings (train) =",
-            round(best_performance$`train_expected savings`, 6), "\n"))
+  cat(paste("  - best average expected cost (train) =",
+            round(best_performance$train_AEC, 6), "\n"))
   if (NCOL(xgbmodel$evaluation_log) == 3) {
-    cat(paste("  - best expected savings (test) =",
-              round(best_performance$`test_expected savings`, 6), "\n"))
+    cat(paste("  - best average expected cost (test) =",
+              round(best_performance$test_AEC, 6), "\n"))
   }
   cat("\n")
 }
diff --git a/csboost/data/creditcard.RData b/csboost/data/creditcard.RData
diff --git a/csboost/man/creditcard.Rd b/csboost/man/creditcard.Rd
@@ -0,0 +1,17 @@
+\name{creditcard}
+\docType{data}
+\alias{creditcard}
+\title{Creditcard Transaction Data}
+\description{
+Transactions made by credit cards in September 2013 by European cardholders. This data set presents a small subset of transactions that occurred in two days, where we have 465 frauds out of 1409 transactions. It contains only numerical input variables which are the result of a PCA transformation. Due to confidentiality issues, the original features and more background information about the data cannot be provided. Features V1, V2, ..., V28 are the principal components obtained with PCA. The only feature which has not been transformed with PCA is ‘Amount’ which is the transaction amount. Feature ‘Class’ is the response variable which takes value 1 in case of fraud and 0 otherwise.
+}
+\usage{data(creditcard)}
+\format{A data frame containing 1409 observations and 30 variables.}
+\source{kaggle.com, made available by Andrea Dal Pozzolo et al., Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015.}
+\keyword{datasets}
+\examples{
+data(creditcard)
+str(creditcard)
+head(creditcard)
+summary(creditcard)
+}
diff --git a/csboost/man/csboost.Rd b/csboost/man/csboost.Rd
@@ -8,7 +8,6 @@
 
 \usage{csboost(formula, train, test = NULL,
         cost_matrix_train, cost_matrix_test = NULL,
-        hessian_type, hessian_constant = NULL,
         nrounds, params = list(),
         verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL,
         save_period = NULL, save_name = "xgboost.model",
@@ -25,10 +24,6 @@
 
 \item{cost_matrix_test}{a matrix of dimension \code{nrow(test)} x 2 (if provided). For each instance, the first/second column contains the cost of correctly/incorrectly predicting the binary class of the instance (default is \code{NULL}).}
 
-\item{hessian_type}{type of approach for calculating the second order gradient. Possible types are \code{"exact"}, \code{"solution1"}, \code{"solution2"} and \code{"constant"}.}
-
-\item{hessian_constant}{numeric value for the constant second order gradient when \code{hessian_type = "constant"}.}
-
 \item{nrounds}{max number of boosting iterations.}
 
 \item{params}{the list of parameters. The complete list of parameters is available at \href{https://xgboost.readthedocs.io/en/latest/parameter.html}{https://xgboost.readthedocs.io/en/latest/parameter.html} A short summary is available in the documentation \code{help(xgb.train)}.}
@@ -80,13 +75,46 @@
 }
 }
 
-\references{...}
+\references{Hoppner, S., Baesens, B., Verbeke, W., and Verdonck, T. (2020). Instance- dependent cost-sensitive learning for detecting transfer fraud. \emph{arXiv:2005.02488}}
 
-\author{Sebastiaan Höppner}
+\author{Sebastiaan Hoppner}
 
 \seealso{\code{\link{summary.csboost}}, \code{\link{plot.csboost}}, \code{\link{predict.csboost}}}
 
 
 \examples{
-# Provide an example (TODO).
+library(csboost)
+data(creditcard)
+
+fixed_cost <- 50
+cost_matrix <- matrix(nrow = nrow(creditcard), ncol = 2)
+cost_matrix[, 1] <- ifelse(creditcard$Class == 1, fixed_cost, 0)
+cost_matrix[, 2] <- ifelse(creditcard$Class == 1, creditcard$Amount, fixed_cost)
+
+i0 <- which(creditcard$Class == 0)
+i1 <- which(creditcard$Class == 1)
+
+set.seed(2020)
+i0_train <- sample(i0, size = 0.7 * length(i0))
+i1_train <- sample(i1, size = 0.7 * length(i1))
+
+train <- creditcard[ c(i0_train, i1_train), ]
+test  <- creditcard[-c(i0_train, i1_train), ]
+
+cost_matrix_train <- cost_matrix[ c(i0_train, i1_train), ]
+cost_matrix_test  <- cost_matrix[-c(i0_train, i1_train), ]
+
+csbtree <- csboost(formula               = Class ~ . - 1,
+                   train                 = train,
+                   test                  = test,
+                   cost_matrix_train     = cost_matrix_train,
+                   cost_matrix_test      = cost_matrix_test,
+                   nrounds               = 300,
+                   early_stopping_rounds = 20,
+                   verbose               = 1,
+                   print_every_n         = 1)
+
+summary(csbtree)
+plot(csbtree)
+predict(csbtree, newdata = test)
 }