
Commit

add functionality to pass forecasts and truth data separately into eval_forecasts(), add appropriate example data and a script to generate the example data
nikosbosse committed Jan 15, 2021
1 parent a6ae2ca commit 7664730
Showing 12 changed files with 359 additions and 11 deletions.
41 changes: 41 additions & 0 deletions R/data.R
@@ -172,3 +172,44 @@
#' }
"binary_example_data"


#' Quantile Example Data - Forecasts only
#'
#' A data set with predictions for different quantities relevant to the
#' 2020 UK Covid-19 epidemic, but without the corresponding true values
#'
#' @format A data frame with 7,581 rows and 9 columns:
#' \describe{
#' \item{value_date}{the date for which a prediction was made}
#' \item{value_type}{the target to be predicted (short form)}
#' \item{geography}{the region for which a prediction was made}
#' \item{model}{name of the model that generated the forecasts}
#' \item{creation_date}{date on which the forecast was made}
#' \item{quantile}{quantile of the corresponding prediction}
#' \item{prediction}{quantile predictions}
#' \item{value_desc}{long form description of the prediction target}
#' \item{horizon}{forecast horizon in days}
#'
#' }
"example_quantile_forecasts_only"


#' Truth data only
#'
#' A data set with truth data for different quantities relevant to the
#' 2020 UK Covid-19 epidemic, but without any predictions
#'
#' @format A data frame with 140 rows and 5 columns:
#' \describe{
#'   \item{value_date}{the date the observed value corresponds to}
#' \item{value_type}{the target to be predicted (short form)}
#' \item{geography}{the region for which a prediction was made}
#' \item{value_desc}{long form description of the prediction target}
#' \item{true_value}{true observed values}
#'
#' }
"example_truth_data_only"




29 changes: 24 additions & 5 deletions R/eval_forecasts.R
@@ -101,6 +101,14 @@
#' @param summarised Summarise arguments (i.e. take the mean per group
#' specified in `summarise_by`). Default is `TRUE`.
#' @param verbose print out additional helpful messages (default is TRUE)
#' @param forecasts data.frame with forecasts that should follow the same
#' general guidelines as the `data` input. Can be used together with
#' `truth_data` to supply forecasts and observations separately. Default is
#' `NULL`.
#' @param truth_data data.frame with a column called `true_value` that will be
#' merged with `forecasts`. Default is `NULL`.
#' @param merge_by character vector with the column names that `forecasts` and
#' `truth_data` should be merged on. Default is `NULL`, in which case the
#' merge will be attempted automatically.
#'
#' @return A data.table with appropriate scores. For binary predictions,
#' the Brier Score will be returned, for quantile predictions the interval
@@ -169,7 +177,7 @@
#' \url{https://doi.org/10.1371/journal.pcbi.1006785}
#' @export

eval_forecasts <- function(data,
eval_forecasts <- function(data = NULL,
by = NULL,
summarise_by = by,
metrics = NULL,
@@ -180,15 +188,26 @@ eval_forecasts <- function(data,
separate_results = TRUE),
pit_plots = FALSE,
summarised = TRUE,
verbose = TRUE) {
verbose = TRUE,
forecasts = NULL,
truth_data = NULL,
merge_by = NULL) {


# preparations ---------------------------------------------------------------
# check data argument is provided
if (!methods::hasArg("data")) {
stop("need arguments 'data'in function 'eval_forecasts()'")
if (is.null(data) && (is.null(truth_data) || is.null(forecasts))) {
stop("need argument 'data' in function 'eval_forecasts()', or alternatively both 'forecasts' and 'truth_data'")
}
if (is.null(data)) {
data <- merge_pred_and_obs(forecasts, truth_data, by = merge_by)
if (nrow(data) == 0) {
if (verbose) {
warning("After attempting to merge, only an empty data.table was left")
}
return(data)
}
}
check_not_null(data = data)

# copy the data to avoid altering the original input in any way
data <- data.table::as.data.table(data)
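
A hedged usage sketch (not part of the diff) of the new arguments, using the example datasets added in this commit; the merge_by columns are the ones shared by both datasets, and summarise_by is an existing argument of eval_forecasts():

# forecasts and observations are supplied separately and merged internally
# via merge_pred_and_obs() before scoring
scores <- eval_forecasts(
  forecasts = example_quantile_forecasts_only,
  truth_data = example_truth_data_only,
  merge_by = c("value_date", "value_type", "geography", "value_desc"),
  summarise_by = c("model", "horizon")
)

# with merge_by = NULL (the default), the merge columns are chosen
# automatically from the shared column names
scores_auto <- eval_forecasts(
  forecasts = example_quantile_forecasts_only,
  truth_data = example_truth_data_only,
  summarise_by = c("model")
)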
3 changes: 2 additions & 1 deletion R/utils.R
@@ -5,7 +5,6 @@
#' variable and the function call where the variable is missing. This function
#' is a helper function that should only be called within other functions
#' @param ... The variables to check
#' @importFrom base deparse1
#' @return The function returns `NULL`, but throws an error if the variable is
#' missing.
check_not_null <- function(...) {
Expand Down Expand Up @@ -77,6 +76,8 @@ globalVariables(c(".",
"boundary",
"brier_score",
"component_value",
"..colnames_x",
"..colnames_y",
"count",
"coverage_deviation",
"CRPS",
26 changes: 23 additions & 3 deletions R/utils_data_handling.R
@@ -391,7 +391,8 @@ sample_to_range <- function(data,
#' @export


merge_pred_and_obs <- function(forecasts, observations, by = NULL) {
merge_pred_and_obs <- function(forecasts, observations,
by = NULL) {

forecasts <- data.table::as.data.table(forecasts)
observations <- data.table::as.data.table(observations)
@@ -406,9 +407,28 @@ merge_pred_and_obs <- function(forecasts, observations, by = NULL) {
obs_cols <- colnames(observations)
by <- intersect(by, obs_cols)

combined <- merge(observations, forecasts, by = by)
# do a left join so that all rows in the observations are kept
combined <- merge(observations, forecasts, by = by, all.x = TRUE)

# maybe add some error handling here
# get colnames that are the same for x and y
colnames <- colnames(combined)
colnames_x <- colnames[endsWith(colnames, ".x")]
colnames_y <- colnames[endsWith(colnames, ".y")]

# extract basenames
basenames_x <- sub(".x$", "", colnames_x)
basenames_y <- sub(".y$", "", colnames_y)

# see whether the column name as well as the content is the same
overlapping <- (as.list(combined[, ..colnames_x]) %in% as.list(combined[, ..colnames_y])) &
  basenames_x == basenames_y
overlap_names <- colnames_x[overlapping]
basenames_overlap <- sub(".x$", "", overlap_names)

# delete overlapping columns
if (length(basenames_overlap) > 0) {
combined[, paste0(basenames_overlap, ".x") := NULL]
combined[, paste0(basenames_overlap, ".y") := NULL]
}

return(combined)
}
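
As an illustration of the duplicate-column handling above, a toy sketch (made-up data, not from the package) showing how a merge produces suffixed .x/.y copies and why pairs that agree in both name and content can be removed:

library(data.table)

obs <- data.table(geography = c("England", "Wales"),
                  value_desc = "Deaths",
                  true_value = c(10, 5))
fc <- data.table(geography = c("England", "Wales"),
                 value_desc = "Deaths",
                 quantile = 0.5,
                 prediction = c(12, 4))

# merging only on "geography" leaves two copies of value_desc
combined <- merge(obs, fc, by = "geography", all.x = TRUE)
names(combined)
# "geography" "value_desc.x" "true_value" "value_desc.y" "quantile" "prediction"

# the pair agrees in base name and content, which is the situation
# merge_pred_and_obs() detects and cleans up after its left join
identical(combined$value_desc.x, combined$value_desc.y)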
Binary file added data/example_quantile_forecasts_only.rda
Binary file not shown.
Binary file added data/example_truth_data_only.rda
Binary file not shown.
169 changes: 169 additions & 0 deletions inst/create_example_data.R
@@ -0,0 +1,169 @@
library(data.table)
library(dplyr)
library(devtools)

# install package from github repository
# devtools::install_github("epiforecasts/covid19.forecasts.uk")

# load the package itself so that helpers such as quantile_to_range_long()
# and range_long_to_wide() are available (assumes the script is run from the
# package root)
devtools::load_all()





# create quantile example ------------------------------------------------------
# load forecasts and do some filtering
data <- covid19.forecasts.uk::uk_forecasts %>%
dplyr::mutate(horizon = as.numeric(value_date - creation_date),
quantile = round(quantile, 3)) %>%
dplyr::filter(model %in% c("EpiSoon", "SIRCOVID", "DetSEIRwithNB MCMC"),
creation_date > "2020-06-01",
geography %in% c("England", "Scotland", "Wales", "Northern Ireland"),
horizon %in% c(7, 14, 21)) %>%
dplyr::rename(prediction = value)

# get available dates
dates <- data$value_date %>%
unique()

# load observations and keep a couple of weeks before any forecasts were made
obs <- covid19.forecasts.uk::covid_uk_data %>%
dplyr::filter(value_date %in% c(as.Date(c("2020-06-08", "2020-06-01", "2020-05-25",
"2020-05-18", "2020-05-11", "2020-05-04")),
dates),
geography %in% c("England", "Scotland", "Wales", "Northern Ireland")) %>%
dplyr::rename(true_value = value)

# save example data with forecasts only
example_quantile_forecasts_only <- data
usethis::use_data(example_quantile_forecasts_only, overwrite = TRUE)

example_truth_data_only <- obs
usethis::use_data(example_truth_data_only, overwrite = TRUE)


# join forecasts and observations, and convert model from a factor to a character
quantile_example_data <- dplyr::left_join(obs, data) %>%
dplyr::mutate(model = as.character(model))
data.table::setDT(quantile_example_data)
usethis::use_data(quantile_example_data, overwrite = TRUE)




# create long range example ----------------------------------------------------
range_example_data_long <- quantile_to_range_long(quantile_example_data,
keep_quantile_col = FALSE)
usethis::use_data(range_example_data_long, overwrite = TRUE)



# create wide range example ----------------------------------------------------
range_example_data_wide <- range_long_to_wide(range_example_data_long)
range_example_data_wide[, NA_NA := NULL]
usethis::use_data(range_example_data_wide, overwrite = TRUE)




#create semi-wide range example ------------------------------------------------
range_example_data_semi_wide <- data.table::copy(range_example_data_long)
range_example_data_semi_wide <- data.table::dcast(range_example_data_semi_wide,
... ~ boundary,
value.var = "prediction")
range_example_data_semi_wide[, "NA" := NULL]
usethis::use_data(range_example_data_semi_wide, overwrite = TRUE)



# get continuous sample data ---------------------------------------------------
# objective function for the gamma fit: sum of squared differences between the
# gamma quantile function (with parameters par) and the target values x
fn_gamma <- function(par, x) {
quantiles <- as.numeric(names(x))
quantiles <- quantiles[!is.na(x)]
x <- x[!is.na(x)]
return(sum((qgamma(quantiles, shape = par[1], rate = par[2]) - x)**2))
}

# define function to fit gamma
fit_gamma <- function(values, quantiles, init) {

x <- values
names(x) <- quantiles

if (missing(init)) {
init <- c(shape = 1, rate = 1)
}

res <- nloptr::sbplx(x0 = init, fn = fn_gamma, x = x,
lower = c(shape = 0, rate = 0),
control = list(xtol_rel = 1.0e-6, ftol_rel = 1.0e-6))
sol <- res$par
names(sol) <- names(init)

return(as.list(sol))
}

# function to obtain samples: fit a gamma distribution to the quantiles and
# draw n_samples random values from it
get_samples <- function(values, quantiles, n_samples = 1000) {
if (any(is.na(values))) {
return(NA_real_)
}
fit <- fit_gamma(values, quantiles)
samples <- rgamma(n = n_samples, rate = fit$rate, shape = fit$shape)
return(samples)
}

# calculate samples
setDT(quantile_example_data)
n_samples <- 50
continuous_example_data <- quantile_example_data[, .(prediction = get_samples(prediction,
quantile,
n_samples = n_samples),
sample = 1:n_samples,
true_value = unique(true_value)),
by = c("value_date", "value_type", "geography",
"value_desc", "model", "creation_date",
"horizon")]
# remove unnecessary rows where no predictions are available
continuous_example_data[is.na(prediction), sample := NA]
continuous_example_data <- unique(continuous_example_data)
usethis::use_data(continuous_example_data, overwrite = TRUE)


# get integer sample data ------------------------------------------------------
integer_example_data <- data.table::copy(continuous_example_data)
integer_example_data <- integer_example_data[, prediction := round(prediction)]
usethis::use_data(integer_example_data, overwrite = TRUE)





# get binary example data ------------------------------------------------------
# construct a binary prediction as the share of samples above the mean
# prediction. Construct the outcome as whether or not the actually
# observed value was above that mean prediction.
# Take this as a way to create example data, not as sound statistical practice

binary_example_data <- data.table::copy(continuous_example_data)

# store grouping variable
by <- c("value_date", "value_type", "geography", "value_desc",
"model", "creation_date", "horizon")

# calculate mean value
binary_example_data[, mean_val := mean(prediction),
by = by]

# calculate binary prediction as percentage above mean
binary_example_data[, prediction := mean(prediction > mean_val),
by = by]

# calculate true value as whether or not observed was above mean
binary_example_data[, true_value := true_value > mean_val]

# delete unnecessary columns and take unique values
binary_example_data[, `:=`(sample = NULL, mean_val = NULL,
true_value = as.numeric(true_value))]
binary_example_data <- unique(binary_example_data)
usethis::use_data(binary_example_data, overwrite = TRUE)

18 changes: 16 additions & 2 deletions man/eval_forecasts.Rd


29 changes: 29 additions & 0 deletions man/example_quantile_forecasts_only.Rd


