
Commit

add functionality to pass forecasts and truth data separately into eval_forecasts(), add appropriate example data and a script to generate the example data
nikosbosse committed Jan 15, 2021
1 parent a6ae2ca commit 7664730
Showing 12 changed files with 359 additions and 11 deletions.
41 changes: 41 additions & 0 deletions R/data.R
@@ -172,3 +172,44 @@
#' }
"binary_example_data"


#' Quantile Example Data - Forecasts only
#'
#' A data set with predictions for different quantities relevant to the
#' 2020 UK Covid-19 epidemic, but without the corresponding true values
#'
#' @format A data frame with 7,581 rows and 9 columns:
#' \describe{
#' \item{value_date}{the date for which a prediction was made}
#' \item{value_type}{the target to be predicted (short form)}
#' \item{geography}{the region for which a prediction was made}
#' \item{model}{name of the model that generated the forecasts}
#' \item{creation_date}{date on which the forecast was made}
#' \item{quantile}{quantile of the corresponding prediction}
#' \item{prediction}{quantile predictions}
#' \item{value_desc}{long form description of the prediction target}
#' \item{horizon}{forecast horizon in days}
#'
#' }
"example_quantile_forecasts_only"


#' Truth data only
#'
#' A data set with truth data for different quantities relevant to the
#' 2020 UK Covid-19 epidemic, but without any predictions
#'
#' @format A data frame with 140 rows and 5 columns:
#' \describe{
#'   \item{value_date}{the date the observed value corresponds to}
#' \item{value_type}{the target to be predicted (short form)}
#' \item{geography}{the region for which a prediction was made}
#' \item{value_desc}{long form description of the prediction target}
#' \item{true_value}{true observed values}
#'
#' }
"example_truth_data_only"




29 changes: 24 additions & 5 deletions R/eval_forecasts.R
@@ -101,6 +101,14 @@
#' @param summarised Summarise arguments (i.e. take the mean per group
#' specified in `summarise_by`). Default is `TRUE`.
#' @param verbose print out additional helpful messages (default is TRUE)
#' @param forecasts data.frame with forecasts that should follow the same
#' general guidelines as the `data` input. Can be used together with
#' `truth_data` to supply forecasts and observations separately. Default is
#' `NULL`.
#' @param truth_data data.frame with a column called `true_value` that will be
#' merged with `forecasts`. Default is `NULL`.
#' @param merge_by character vector with the column names that `forecasts` and
#' `truth_data` should be merged on. Default is `NULL`, in which case the
#' merge will be attempted automatically.
#'
#' @return A data.table with appropriate scores. For binary predictions,
#' the Brier Score will be returned, for quantile predictions the interval
@@ -169,7 +177,7 @@
#' \url{https://doi.org/10.1371/journal.pcbi.1006785}
#' @export

eval_forecasts <- function(data,
eval_forecasts <- function(data = NULL,
by = NULL,
summarise_by = by,
metrics = NULL,
@@ -180,15 +188,26 @@ eval_forecasts <- function(data,
separate_results = TRUE),
pit_plots = FALSE,
summarised = TRUE,
verbose = TRUE) {
verbose = TRUE,
forecasts = NULL,
truth_data = NULL,
merge_by = NULL) {


# preparations ---------------------------------------------------------------
# check data argument is provided
if (!methods::hasArg("data")) {
stop("need arguments 'data'in function 'eval_forecasts()'")
if (is.null(data) && (is.null(truth_data) || is.null(forecasts))) {
stop("need argument 'data' in function 'eval_forecasts()', or alternatively both 'forecasts' and 'truth_data'")
}
if (is.null(data)) {
data <- merge_pred_and_obs(forecasts, truth_data, by = merge_by)
if (nrow(data) == 0) {
if (verbose) {
warning("After attempting to merge, only an empty data.table was left")
}
return(data)
}
}
check_not_null(data = data)

# copy the data to avoid altering the original input in any way
data <- data.table::as.data.table(data)
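
A hedged usage sketch (not part of the diff) of the new arguments, using the example datasets added in this commit; the merge_by columns are the ones shared by both datasets, and summarise_by is an existing argument of eval_forecasts():

# forecasts and observations are supplied separately and merged internally
# via merge_pred_and_obs() before scoring
scores <- eval_forecasts(
  forecasts = example_quantile_forecasts_only,
  truth_data = example_truth_data_only,
  merge_by = c("value_date", "value_type", "geography", "value_desc"),
  summarise_by = c("model", "horizon")
)

# with merge_by = NULL (the default), the merge columns are chosen
# automatically from the shared column names
scores_auto <- eval_forecasts(
  forecasts = example_quantile_forecasts_only,
  truth_data = example_truth_data_only,
  summarise_by = c("model")
)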
3 changes: 2 additions & 1 deletion R/utils.R
@@ -5,7 +5,6 @@
#' variable and the function call where the variable is missing. This function
#' is a helper function that should only be called within other functions
#' @param ... The variables to check
#' @importFrom base deparse1
#' @return The function returns `NULL`, but throws an error if the variable is
#' missing.
check_not_null <- function(...) {
Expand Down Expand Up @@ -77,6 +76,8 @@ globalVariables(c(".",
"boundary",
"brier_score",
"component_value",
"..colnames_x",
"..colnames_y",
"count",
"coverage_deviation",
"CRPS",
26 changes: 23 additions & 3 deletions R/utils_data_handling.R
@@ -391,7 +391,8 @@ sample_to_range <- function(data,
#' @export


merge_pred_and_obs <- function(forecasts, observations, by = NULL) {
merge_pred_and_obs <- function(forecasts, observations,
by = NULL) {

forecasts <- data.table::as.data.table(forecasts)
observations <- data.table::as.data.table(observations)
@@ -406,9 +407,28 @@ merge_pred_and_obs <- function(forecasts, observations, by = NULL) {
obs_cols <- colnames(observations)
by <- intersect(by, obs_cols)

combined <- merge(observations, forecasts, by = by)
# do a left join so that all rows in the observations are kept
combined <- merge(observations, forecasts, by = by, all.x = TRUE)

# maybe add some error handling here
# get colnames that are the same for x and y
colnames <- colnames(combined)
colnames_x <- colnames[endsWith(colnames, ".x")]
colnames_y <- colnames[endsWith(colnames, ".y")]

# extract basenames
basenames_x <- sub(".x$", "", colnames_x)
basenames_y <- sub(".y$", "", colnames_y)

# see whether the column name as well as the content is the same
overlapping <- (as.list(combined[, ..colnames_x]) %in% as.list(combined[, ..colnames_y])) &
  basenames_x == basenames_y
overlap_names <- colnames_x[overlapping]
basenames_overlap <- sub(".x$", "", overlap_names)

# delete overlapping columns
if (length(basenames_overlap) > 0) {
combined[, paste0(basenames_overlap, ".x") := NULL]
combined[, paste0(basenames_overlap, ".y") := NULL]
}

return(combined)
}
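
As an illustration of the duplicate-column handling above, a toy sketch (made-up data, not from the package) showing how a merge produces suffixed .x/.y copies and why pairs that agree in both name and content can be removed:

library(data.table)

obs <- data.table(geography = c("England", "Wales"),
                  value_desc = "Deaths",
                  true_value = c(10, 5))
fc <- data.table(geography = c("England", "Wales"),
                 value_desc = "Deaths",
                 quantile = 0.5,
                 prediction = c(12, 4))

# merging only on "geography" leaves two copies of value_desc
combined <- merge(obs, fc, by = "geography", all.x = TRUE)
names(combined)
# "geography" "value_desc.x" "true_value" "value_desc.y" "quantile" "prediction"

# the pair agrees in base name and content, which is the situation
# merge_pred_and_obs() detects and cleans up after its left join
identical(combined$value_desc.x, combined$value_desc.y)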
Binary file added data/example_quantile_forecasts_only.rda
Binary file not shown.
Binary file added data/example_truth_data_only.rda
Binary file not shown.
169 changes: 169 additions & 0 deletions inst/create_example_data.R
@@ -0,0 +1,169 @@
library(data.table)
library(dplyr)
library(devtools)

# install package from github repository
# devtools::install_github("epiforecasts/covid19.forecasts.uk")

# load the package itself so that helpers such as quantile_to_range_long()
# and range_long_to_wide() are available (assumes the script is run from the
# package root)
devtools::load_all()





# create quantile example ------------------------------------------------------
# load forecasts and do some filtering
data <- covid19.forecasts.uk::uk_forecasts %>%
dplyr::mutate(horizon = as.numeric(value_date - creation_date),
quantile = round(quantile, 3)) %>%
dplyr::filter(model %in% c("EpiSoon", "SIRCOVID", "DetSEIRwithNB MCMC"),
creation_date > "2020-06-01",
geography %in% c("England", "Scotland", "Wales", "Northern Ireland"),
horizon %in% c(7, 14, 21)) %>%
dplyr::rename(prediction = value)

# get available dates
dates <- data$value_date %>%
unique()

# load observations and keep a couple of weeks before any forecasts were made
obs <- covid19.forecasts.uk::covid_uk_data %>%
dplyr::filter(value_date %in% c(as.Date(c("2020-06-08", "2020-06-01", "2020-05-25",
"2020-05-18", "2020-05-11", "2020-05-04")),
dates),
geography %in% c("England", "Scotland", "Wales", "Northern Ireland")) %>%
dplyr::rename(true_value = value)

# save example data with forecasts only
example_quantile_forecasts_only <- data
usethis::use_data(example_quantile_forecasts_only, overwrite = TRUE)

example_truth_data_only <- obs
usethis::use_data(example_truth_data_only, overwrite = TRUE)


# join forecasts and observations, and convert model from a factor to a character
quantile_example_data <- dplyr::left_join(obs, data) %>%
dplyr::mutate(model = as.character(model))
data.table::setDT(quantile_example_data)
usethis::use_data(quantile_example_data, overwrite = TRUE)




# create long range example ----------------------------------------------------
range_example_data_long <- quantile_to_range_long(quantile_example_data,
keep_quantile_col = FALSE)
usethis::use_data(range_example_data_long, overwrite = TRUE)



# create wide range example ----------------------------------------------------
range_example_data_wide <- range_long_to_wide(range_example_data_long)
range_example_data_wide[, NA_NA := NULL]
usethis::use_data(range_example_data_wide, overwrite = TRUE)




#create semi-wide range example ------------------------------------------------
range_example_data_semi_wide <- data.table::copy(range_example_data_long)
range_example_data_semi_wide <- data.table::dcast(range_example_data_semi_wide,
... ~ boundary,
value.var = "prediction")
range_example_data_semi_wide[, "NA" := NULL]
usethis::use_data(range_example_data_semi_wide, overwrite = TRUE)



# get continuous sample data ---------------------------------------------------
# objective function for the gamma fit: sum of squared differences between the
# gamma quantile function (with parameters par) and the target values x
fn_gamma <- function(par, x) {
quantiles <- as.numeric(names(x))
quantiles <- quantiles[!is.na(x)]
x <- x[!is.na(x)]
return(sum((qgamma(quantiles, shape = par[1], rate = par[2]) - x)**2))
}

# define function to fit gamma
fit_gamma <- function(values, quantiles, init) {

x <- values
names(x) <- quantiles

if (missing(init)) {
init <- c(shape = 1, rate = 1)
}

res <- nloptr::sbplx(x0 = init, fn = fn_gamma, x = x,
lower = c(shape = 0, rate = 0),
control = list(xtol_rel = 1.0e-6, ftol_rel = 1.0e-6))
sol <- res$par
names(sol) <- names(init)

return(as.list(sol))
}

# function to obtain samples: fit a gamma distribution to the quantiles and
# draw n_samples random values from it
get_samples <- function(values, quantiles, n_samples = 1000) {
if (any(is.na(values))) {
return(NA_real_)
}
fit <- fit_gamma(values, quantiles)
samples <- rgamma(n = n_samples, rate = fit$rate, shape = fit$shape)
return(samples)
}

# calculate samples
setDT(quantile_example_data)
n_samples <- 50
continuous_example_data <- quantile_example_data[, .(prediction = get_samples(prediction,
quantile,
n_samples = n_samples),
sample = 1:n_samples,
true_value = unique(true_value)),
by = c("value_date", "value_type", "geography",
"value_desc", "model", "creation_date",
"horizon")]
# remove unnecessary rows where no predictions are available
continuous_example_data[is.na(prediction), sample := NA]
continuous_example_data <- unique(continuous_example_data)
usethis::use_data(continuous_example_data, overwrite = TRUE)


# get integer sample data ------------------------------------------------------
integer_example_data <- data.table::copy(continuous_example_data)
integer_example_data <- integer_example_data[, prediction := round(prediction)]
usethis::use_data(integer_example_data, overwrite = TRUE)





# get binary example data ------------------------------------------------------
# construct a binary prediction as the share of samples above the mean
# prediction. Construct the outcome as whether or not the actually
# observed value was above that mean prediction.
# Take this as a way to create example data, not as sound statistical practice

binary_example_data <- data.table::copy(continuous_example_data)

# store grouping variable
by <- c("value_date", "value_type", "geography", "value_desc",
"model", "creation_date", "horizon")

# calculate mean value
binary_example_data[, mean_val := mean(prediction),
by = by]

# calculate binary prediction as percentage above mean
binary_example_data[, prediction := mean(prediction > mean_val),
by = by]

# calculate true value as whether or not observed was above mean
binary_example_data[, true_value := true_value > mean_val]

# delete unnecessary columns and take unique values
binary_example_data[, `:=`(sample = NULL, mean_val = NULL,
true_value = as.numeric(true_value))]
binary_example_data <- unique(binary_example_data)
usethis::use_data(binary_example_data, overwrite = TRUE)

18 changes: 16 additions & 2 deletions man/eval_forecasts.Rd


29 changes: 29 additions & 0 deletions man/example_quantile_forecasts_only.Rd


