fixed cfit bug

gdmcbain · Feb 26, 2024 · 1023881 · 1023881
1 parent 48b9c77
commit 1023881
Show file tree

Hide file tree

Showing 91 changed files with 509 additions and 450 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: NADA2
 Type: Package
 Title: Data Analysis for Censored Environmental Data
-Version: 1.1.5
-Date: 2023-09-19
+Version: 1.1.6
+Date: 2024-02-26
 Authors@R: c(person("Paul", "Julian",role=c("aut","cre"),
             email="[email protected]"),
             person("Dennis", "Helsel",role=c("aut","cph"),

diff --git a/NEWS.md b/NEWS.md
@@ -49,8 +49,18 @@
 
 * Minor bug fix for `cencorreg.R` function (2023-01-31)
 
-# NADA2 1.2.0
+# NADA2 1.1.4
 
 * Minor change to `censeaken.R` and `kenplot.R` to allow for some customization of plot using ... in the function (2023-04-13)
 
 * Minor change to `cboxplot.R` to fix factor grouping, added detection limit lty and col arguments (2023-04-13) 
+
+# NADA2 1.1.5
+
+* Minor update due to changes to `EnvStats` (2023-10-20)
+
+# NADA2 1.1.6
+
+* Fixed bug in `cfit.R` regarding reporting of quantiles (2024-02-23)
+
+* adjusted functions that depend on `cfit(...)` like `cen1way(...)` 
diff --git a/R/cen1way.R b/R/cen1way.R
@@ -70,8 +70,8 @@ cen1way <- function(x1, x2, group, mcomp.method = "BH", printstat=TRUE) {
     y1gp <- y1[Factor==groupnames[i]]
     y2gp <- y2[Factor==groupnames[i]]
     Cstats <- suppressWarnings(cfit(y1gp, as.logical(y2gp), printstat=FALSE, Cdf = FALSE))
-    Cstats <- Cstats[c(1:6)]
-    Cstats <- Cstats[-3]
+    Cstats <- Cstats[[1]][c(1:6)] ## summary stats
+    Cstats <- Cstats[-3] ## N, PcfND, Mean, SD and LCLmean only
     Cstats <- data.frame(Cstats)
     Cstats$grp <-  groupnames[i] # added to include group in summary data frame.
     rownames(Cstats) <-NULL # added to clean up row names.

diff --git a/R/cfit.R b/R/cfit.R
@@ -7,29 +7,30 @@
 #' @param qtls Probabilities for the quantiles to be estimated.  Defaults are (0.10, 0.25, 0.50, 0.75, 0.90).  You may add and/or substitute probabilities -- all must be between and not including 0 to 1.
 #' @param Cdf Logical `TRUE`/`FALSE` indicator of whether to plot the empirical cumulative distribution function (cdf).
 #' @param ecdf.col Color for the ecdf plotted step function line.  Default is black.
-#' @param km.orig  If `TRUE`, Kaplan-Meier results in the realm below detection limits reported as "NA".  If `FALSE` (default), information in the detection limits is used and results in the realm of detections limits reported as "< DL", where DL is the appropriate detection limit.
+#' @param km.orig  If `TRUE` (default), Kaplan-Meier results in the realm below detection limits reported as "NA".  If `FALSE`, information in the detection limits is used and results in the realm of detection limits reported as "< DL", where DL is the appropriate detection limit.
 #' @param printstat Logical `TRUE`/`FALSE` option of whether to print the resulting statistics in the console window, or not.  Default is `TRUE.`
 #' @param Ylab Optional input text in quotes to be used as the variable name on the ecdf plot.  The default is the name of the `y1` input variable.
-#' @param plot.pos numeric scalar between 0 and 1 containing the value of the plotting position constant. The default value is `plot.pos=0.375`, the Blom plotting position
-#' @param q.type an integer between 1 and 9 selecting one of the nine quantile algorithms detailed below to be used. See `stats::quantile` for more detail, default is set to 7.
+#' @param q.type an integer between 1 and 9 selecting one of the nine quantile algorithms used only when km.orig = FALSE. See `stats::quantile` and Details below for more detail, default when km.orig = FLASE is set to 6.
 #' @importFrom survival Surv survfit
 #' @importFrom stats quantile
 #' @return
 #' If `printstat=TRUE`: Based on the provided `conf` value, Kaplan-Meier summary statistics (`mean`,`sd`,`median`), lower and upper confidence intervals around the mean and median value, sample size and percent of censored samples are returned. The specified quantile values are also printed and returned.
 #'
 #' If `Cdf=TRUE`: The ecdf of censored data is plotted.
-#' @details Moment statistics are estimated using the enparCensored function of the EnvStats package. This avoids a small bias in the mean produced by the NADA package's cenfit function, which uses the reverse Kaplan-Meier procedure, converting left-censored to right-censored data prior to computing the ecdf and mean. See Gillespie et al. for more discussion on the bias of the estimate of the mean.
+#' @details Moment statistics are estimated using the enparCensored function of the EnvStats package. This avoids a small bias in the mean produced by the NADA package's cenfit function, which uses the reverse Kaplan-Meier procedure, converting left-censored to right-censored data prior to computing the ecdf and mean. See Gillespie et al.(2010) for more discussion on the bias of the estimate of the mean.
 #'
-#' Quantiles and confidence limits on the median are estimated using the quantile function of the survfit command. See ?quantiles for choosing the q.type; default q.type = 7.
+#' Quantiles and their two-sided confidence limits are estimated using the quantile function of the survfit command. See ?quantiles or Helsel et al. (2020) for choosing the q.type; default q.type = 4 (Kaplan-Meier; prob = i/n) when km.orig = TRUE. This is standard procedure in the survival analysis discipline and in the survival package of R, and is also used by the cenfit function in the NADA package. While this is 'industry standard' in medical applications it is a poor choice for observed sample (rather than census) data because it means that the largest observation is assigned a probability equal to 1, the 100th percentile. This implies that this value is never expected to be exceeded when more sample data are collected. It also means the largest observation is not plotted on the ecdf in most software because it is "off the chart". For small datasets in particular it is unlikely that the current largest observation is the largest value in the population and so the resulting ecdf quantiles are likely not opotimal. The default q.type = 6 (Weibull; prob = i/(n+1)) when km.orig = FALSE, though that may be changed by the user. the largest observation plots at a probability less than 1 on the ecdf.  Differences in results when differing q.types are used will decrease as sample size increases.
 #'
-#' All printed values will also be output to an object if saved.  Values are character because of the possibility of a `<1`, but if no `<` symbol can be converted to numeric using the `as.numeric(...)` command.  For data without censoring cfit will also return values.  For data without censoring cfit will also return values.  In that case it returns standard arithmetic mean, standard deviation and quantiles instead of K-M versions.
+#' All printed values will also be output to an object if saved. Confidence intervals on the quantiles are also output when data include nondetects. Values are character because of the possibility of a `<1`, but if no `<` symbol can be converted to numeric using the `as.numeric(...)` command.  For data without censoring cfit will return numeric values. In that case it returns standard arithmetic mean, standard deviation and quantiles instead of K-M versions.
 #'
 #' @export
 #' @references
 #' Helsel, D.R., 2011. Statistics for Censored Environmental Data using Minitab and R, 2nd ed. John Wiley & Sons, USA, N.J.
 #'
 #' Gillespie, B.W., et al., 2010.  Estimating Population Distributions When Some Data Are Below a Limit of Detection by Using a Reverse Kaplan-Meier Estimator. Epidemiology 21, 564-570.
 #'
+#' Helsel, D.R., Hirsch, R.M., Ryberg, K.R., Archfield, S.A., and Gilroy, E.J., 2020, Statistical Methods in Water Resources: U.S. Geological Survey Techniques and Methods, book 4, chapter A3, 458 p., https://doi.org/10.3133/tm4a3.
+#'
 #' Millard, S.P, 2013. EnvStats: An R Package for Environmental Statistics, 2nd ed. Springer Science+Business Media, USA, N.Y.  DOI 10.1007/978-1-4614-8456-1© Springer Science+Business Media New York 2013”
 #'
 #' Excerpt From: Steven P. Millard. “EnvStats.” Apple Books.
@@ -42,8 +43,10 @@
 #' cfit(Brumbaugh$Hg,Brumbaugh$HgCen)
 #'
 
-cfit <- function(y, cens, conf=0.95, qtls = c(0.10, 0.25, 0.50, 0.75, 0.90), plot.pos = 0.375, q.type = 7, Cdf = TRUE, ecdf.col = 1, km.orig = FALSE, printstat = TRUE, Ylab = NULL)
-{  if (is.null(Ylab)) {Ylab <- deparse(substitute(y))}
+cfit <- function(y, cens, conf=0.95, qtls = c(0.10, 0.25, 0.50, 0.75, 0.90),
+                 q.type = 6, Cdf = TRUE, ecdf.col = 1, km.orig = TRUE,
+                 printstat = TRUE, Ylab = NULL){
+  if (is.null(Ylab)) {Ylab <- deparse(substitute(y))}
   Conf <- 100*conf
   if (isFALSE(0.50 %in% qtls)) {qtls <- append(0.50, qtls)}     # median always needed
   k <- length(qtls)
@@ -91,58 +94,67 @@ cfit <- function(y, cens, conf=0.95, qtls = c(0.10, 0.25, 0.50, 0.75, 0.90), plo
   UCLmean <- signif(KMmean + qt.ci[2]*std.err, 4)
   KMmean <- signif(KMmean, 4)
 
-  # using EnvStats::ecdfPlotCensored produces the plot and some info for computing quantiles.
-  # quantiles are computed later using the quantile function
-  ecdf.out <- ecdfPlotCensored(y1, y2, plot.it = Cdf, plot.pos.con = plot.pos, prob.method = "kaplan-meier", type = "s", ecdf.col = ecdf.col, xlab = Ylab, main = "Empirical CDF of Censored Data", ecdf.lwd = par("cex"))
+  # using EnvStats::ecdfPlotCensored produces the plot and computes quantiles for Kaplan-Meier.
+  ecdf.out <- ecdfPlotCensored(y1, y2, plot.it = Cdf, plot.pos.con = 0.375, prob.method = "kaplan-meier", type = "s", ecdf.col = ecdf.col, xlab = Ylab, main = "Empirical CDF of Censored Data", ecdf.lwd = par("cex"))
+
   for (i in 1:k)  {ecdf.qtls[i] <- min(ecdf.out$Cumulative.Probabilities[ecdf.out$Cumulative.Probabilities >= qtls[i]])
-  # Above gives the same as the quantile function
   ecdf.ranks[i] <- min(which(ecdf.out$Cumulative.Probabilities == ecdf.qtls[i]))
   ecdf.yvals[i] <- nonas[,1][ecdf.ranks[i]]
-  # using ecdf for quantiles gives the lowest DL for a series of censored data.
-  # the quantile function below gives the highest DL for a given cumulative probability. This is what should be used for a "<DL" result.
+  # these are standard Kaplan-Meier quantiles
+  # gives the lowest DL for a series of censored data.
   }
 
-  # print(ecdf.out); print(ecdf.ranks); print (ecdf.yvals)
-
-  # use quantile function below to get standard Kaplan-Meier quantiles with NAs for low-level results.
-  # get quantiles and conf int on median by quantile function on a surv object.
-  # requires unflipping the results.The 3 surv.quantiles variables end up correct.
-  surv.quantiles <- quantile(y.out, probs = qtls, conf.int = TRUE)
-  surv.quantiles$quantile <-  rev(flip.const - surv.quantiles$quantile)
-  names(surv.quantiles$quantile) <- paste (as.character(100*(qtls[1:k])), sep = "")
-  stash.upper <- surv.quantiles$upper
-  surv.quantiles$upper <-  rev(flip.const - surv.quantiles$lower)
-  names(surv.quantiles$upper) <- paste (as.character(100*(qtls[1:k])), sep = "")
-  surv.quantiles$lower <-  rev(flip.const - stash.upper)
-  names(surv.quantiles$lower) <- paste (as.character(100*(qtls[1:k])), sep = "")
-  KMmedian <- signif(surv.quantiles$quantile[median.pos], 4)
-  Qtls <- as.character(signif(surv.quantiles$quantile, 4))
-  KMmed <- surv.quantiles$quantile[median.pos]  # possibly == NA
-  LCLmed <- signif(surv.quantiles$lower[median.pos], 4)  # possibly == NA
-  UCLmed <- signif(surv.quantiles$upper[median.pos], 4)   # possibly == NA
-  LCLmedian <- LCLmed
-  UCLmedian <- UCLmed
-
-  # NAs in quantiles changed to the highest detection limit for a series of DLs with idential cumulative probabilites.  Not standard K-M but provides more info.
+  # get conf int on median by quantile function on a surv object (y.out).
+  # requires unflipping the results using 1-qtls for probabilities.
+  surv.median <- quantile(y.out, probs = 0.5, conf.int = TRUE)
+  names(ecdf.yvals) <- paste (as.character(100*(qtls[1:k])), sep = "")
+  surv.quantiles <- quantile(y.out, probs = 1-qtls, conf.int = TRUE, type=q.type)
+  Qtiles <- flip.const - surv.quantiles$quantile    # numeric
+
+  KMmedian <- signif(ecdf.yvals[median.pos], 4)
+  Qtls <- as.character(signif(ecdf.yvals, 4))       # character
+
+  LCLmedian <- signif(flip.const - surv.median$upper, 4)  # possibly == NA
+  UCLmedian <- signif(flip.const - surv.median$lower, 4)   # possibly == NA
+  Lower <- signif(flip.const - surv.quantiles$upper, 4)        # numeric
+  names(Lower) <- paste (as.character(100*(qtls[1:k])), sep = "")
+  Upper <- signif(flip.const - surv.quantiles$lower, 4)       # numeric
+  names(Upper) <- paste (as.character(100*(qtls[1:k])), sep = "")
+
+  # NAs in quantiles changed to the highest detection limit for a series of DLs with idential cumulative probabilites.  Provides more info than standard K-M.
+  # the quantile function below gives the highest DL for a given cumulative probability. This is what should be used for a "<DL" result.
   if (km.orig == FALSE) {
-    for (i in 1:k) {
-      max.tied[i] <- max(which(ecdf.out$Cumulative.Probabilities == ecdf.qtls[i]))
-      Qtls[i] <- signif(surv.quantiles$quantile[i])
-      if (is.na(surv.quantiles$quantile[i])) {Qtls[i] <- nonas[,1][max.tied[i]]
-      Qtls[i] <- paste("<", Qtls[i], sep="")}
-    }
-    # getting <DL values for Conf Intervals on median
-    KMmedian <- Qtls[median.pos]
-    if (is.na(LCLmed)) {
-      if (is.na(KMmed)) {  LCLmedian <- KMmedian  }
-      else { lcl.tied <- nonas[,1][max.tied[1]]
-      LCLmedian <- paste("<", lcl.tied, sep="") }
-    }
-
-    UCLmedian <- as.character(UCLmed)
-    if (is.na(UCLmed)) { UCLmedian <- "unknown"  }
-  }   # end of enhanced output when km.orig == FALSE
-  }  # end of data include nondetects
+    surv.quantiles <- quantile(y.out, probs = 1-qtls, conf.int = TRUE, type=q.type)
+    Qtiles <- flip.const - surv.quantiles$quantile    # numeric
+    names(Qtiles) <- paste (as.character(100*(qtls[1:k])), sep = "")
+    Lower <- signif(flip.const - surv.quantiles$upper, 4)        # numeric
+    names(Lower) <- paste (as.character(100*(qtls[1:k])), sep = "")
+    Upper <- signif(flip.const - surv.quantiles$lower, 4)       # numeric
+    names(Upper) <- paste (as.character(100*(qtls[1:k])), sep = "")
+    Qtls <- as.character(signif(Qtiles, 4))    #character
+    KMmedian <- signif(Qtiles[median.pos], 4)    #character
+    LCLmedian <- as.character(signif(Lower[median.pos], 4))  # character, possibly == NA
+    UCLmedian <- as.character(signif(Upper[median.pos], 4))   # character, possibly == NA
+
+    for (i in 1:k) {max.tied[i] <- max(which(ecdf.out$Cumulative.Probabilities == ecdf.qtls[i]))
+
+    if (is.na(Qtiles[i])) {Qtiles[i] <- nonas[,1][max.tied[i]]
+    Qtls[i] <- paste("<", Qtiles[i], sep="")
+    KMmedian <- Qtls[median.pos]}
+
+    # getting <DL values for Lower CI endpoints
+    if(is.na(Lower[i])) {Lower[i]<- max(DLs[DLs <= Qtiles[i]])
+    Lower[i] <- as.character(paste("<", Lower[i], sep="") )
+    LCLmedian <- Lower[median.pos] }
+
+    # getting reality for upper CI endpoints
+    if (is.na(Upper[i])) {
+      Upper[i] <- "unknown"}
+    }  # end of loop for establishing quantiles and CIs if <DL
+    UCLmedian <- Upper[median.pos]
+  }   # end of when km.orig == FALSE
+  #  print(nonas[,1]); print(nonas[,2]); print(ecdf.ranks); print(max.tied)
+  }  # end of data that include nondetects
 
   # when all data are detects
   else  {Mean <- mean(y1, na.rm = TRUE)
@@ -156,7 +168,7 @@ cfit <- function(y, cens, conf=0.95, qtls = c(0.10, 0.25, 0.50, 0.75, 0.90), plo
   SD <- signif(SD, 4)
   Qtls <- signif(quantile(y1, probs = qtls, na.rm = TRUE, type = q.type), 4)
   Median <-  signif(quantile(y1, probs = 0.50, na.rm = TRUE, type = q.type), 4)
-  ecdf.out <- ecdfPlot(y1, plot.it = Cdf, plot.pos.con = plot.pos, type = "s", xlab = Ylab, main = paste("Empirical CDF of", deparse(substitute(y))), ecdf.col = ecdf.col, ecdf.lwd = par("cex"))
+  ecdf.out <- ecdfPlot(y1, plot.it = Cdf, plot.pos.con = 0.375, type = "s", xlab = Ylab, main = paste("Empirical CDF of", deparse(substitute(y))), ecdf.col = ecdf.col, ecdf.lwd = par("cex"))
 
   # all data are detects: getting CIs for median
   orderstat.CI <- quantile_CI (N, 0.50, alpha = 1-conf)
@@ -181,8 +193,9 @@ cfit <- function(y, cens, conf=0.95, qtls = c(0.10, 0.25, 0.50, 0.75, 0.90), plo
   }
 
   names(Qtls) <- quant.char
-  Qtls <- t(Qtls)
-  cstats <- cbind(stats, Qtls)
+  if (sum(y2) != 0) {Qtls.out <- cbind(Lower, Qtls, Upper)}
+  else {Qtls.out <- Qtls }
+  cstats <- list(stats, Qtls.out)
   return (invisible (cstats))
 }
 

diff --git a/build.R b/build.R
@@ -49,7 +49,7 @@ devtools::check(vignettes=F)
 pkgdown::build_site()
 
 ## Builds package
-# devtools::build(vignettes=F)
+devtools::build(vignettes=F)
 # devtools::build()
 
 # -------------------------------------------------------------------------

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html