Better truncation handling (#44)

* implement xc * integral over delay * add lots of catches and debug modes * discover xc can be negative intentionally * test with many chains * relax truncation of mu * midpoint approx * get rid of the mid point and instead do the truncation outside the integral * rejig where we truncate * update approx * reformulate stan code * keep hacking * try dropping xc support * extend xc boundaries * add a test for the weibull sampling recovery for R code * enhance stan to R cross testing * refactor * update getting started * update news * package checking and turn off more exact approach * update check
epinowcast · Sep 6, 2024 · e648b4f · e648b4f
1 parent 74c2c8c
commit e648b4f
Show file tree

Hide file tree

Showing 14 changed files with 465 additions and 244 deletions.
diff --git a/.github/workflows/check-cmdstan.yaml b/.github/workflows/check-cmdstan.yaml
@@ -69,7 +69,7 @@ jobs:
           # We can't use TRUE here as pendatic check return lots of false
           # positives related to our use of functions.
           stopifnot(
-            length(message) > 11 &&
+            length(message) != 0 && length(message) < 12 &&
             any(message == "Stan program is syntactically correct")
           )
         shell: Rscript {0}
diff --git a/NAMESPACE b/NAMESPACE
@@ -17,3 +17,4 @@ export(rpcens)
 export(rprimarycensoreddist)
 importFrom(stats,dunif)
 importFrom(stats,runif)
+importFrom(stats,setNames)
diff --git a/NEWS.md b/NEWS.md
@@ -13,6 +13,10 @@ interface changes in `0.3.0` for the Stan code.
 * Added support for `swindow = 0` to `rprimarycensoreddist` to allow for non-secondary event censored distributions.
 * Adapted `rprimarycensoreddist` so that truncation is based on the primary censored distribution before secondary events are censored. This better matches the generative process.
 * Added a new Stan interface tool to enable finding which files functions are implemented in the Stan code.
+* Updated the approach to truncation to be outside the primary censored distribution integral.
+* Improved tests that compare random sampling and probability mass/density functions between R and Stan.
+* Improved cross testing between R and Stan implementations of the primary censored distributions.
+* Worked on improving the stability of the `primary_censored_dist_lpmf` when used for NUTS based fitting (i.e. in Stan).
 
 ## Documentation
 

diff --git a/R/dprimarycensoreddist.R b/R/dprimarycensoreddist.R
@@ -34,12 +34,29 @@
 #' \deqn{
 #' f_{\text{cens}}(d) = F_{\text{cens}}(d + \text{swindow}) - F_{\text{cens}}(d)
 #' }
-#' where \eqn{F_{\text{cens}}} is the primary event censored CDF. For the
-#' explanation and mathematical details of the CDF, refer to the documentation
-#' of [pprimarycensoreddist()].
+#' where \eqn{F_{\text{cens}}} is the primary event censored CDF.
+#'
+#' The function first computes the CDFs for all unique points (including both
+#' \eqn{d} and \eqn{d + \text{swindow}}) using [pprimarycensoreddist()]. It then
+#' creates a lookup table for these CDFs to efficiently calculate the PMF for
+#' each input value. For non-positive delays, the function returns 0.
+#'
+#' If a finite maximum delay \eqn{D} is specified, the PMF is normalized to
+#' ensure it sums to 1 over the range \[0, D\]. This normalization can be
+#' expressed as:
+#' \deqn{
+#' f_{\text{cens,norm}}(d) = \frac{f_{\text{cens}}(d)}{\sum_{i=0}^{D-1}
+#'  f_{\text{cens}}(i)}
+#' }
+#' where \eqn{f_{\text{cens,norm}}(d)} is the normalized PMF and
+#' \eqn{f_{\text{cens}}(d)} is the unnormalized PMF. For the explanation and
+#' mathematical details of the CDF, refer to the documentation of
+#' [pprimarycensoreddist()].
 #'
 #' @family primarycensoreddist
 #'
+#' @importFrom stats setNames
+#'
 #' @examples
 #' # Example: Weibull distribution with uniform primary events
 #' dprimarycensoreddist(c(0.1, 0.5, 1), pweibull, shape = 1.5, scale = 2.0)
@@ -65,19 +82,45 @@ dprimarycensoreddist <- function(
     )
   }
 
+  # Compute CDFs for all unique points
+  unique_points <- sort(unique(c(x, x + swindow)))
+  unique_points <- unique_points[unique_points > 0]
+  if (length(unique_points) == 0) {
+    return(rep(0, length(x)))
+  }
+
+  cdfs <- pprimarycensoreddist(
+    unique_points, pdist, pwindow, Inf, dprimary, dprimary_args, ...
+  )
+
+  # Create a lookup table for CDFs
+  cdf_lookup <- stats::setNames(cdfs, as.character(unique_points))
+
   result <- vapply(x, function(d) {
     if (d < 0) {
-      return(0) # Return log(0) for non-positive delays
+      return(0) # Return 0 for negative delays
+    } else if (d == 0) {
+      # Special case for d = 0
+      cdf_upper <- cdf_lookup[as.character(swindow)]
+      return(cdf_upper)
     } else {
-      pprimarycensoreddist(
-        d + swindow, pdist, pwindow, D, dprimary, dprimary_args, ...
-      ) -
-        pprimarycensoreddist(
-          d, pdist, pwindow, D, dprimary, dprimary_args, ...
-        )
+      cdf_upper <- cdf_lookup[as.character(d + swindow)]
+      cdf_lower <- cdf_lookup[as.character(d)]
+      return(cdf_upper - cdf_lower)
     }
   }, numeric(1))
 
+  if (is.finite(D)) {
+    if (max(unique_points) == D) {
+      cdf_D <- max(cdfs)
+    } else {
+      cdf_D <- pprimarycensoreddist(
+        D, pdist, pwindow, Inf, dprimary, dprimary_args, ...
+      )
+    }
+    result <- result / cdf_D
+  }
+
   if (log) {
     return(log(result))
   } else {

diff --git a/R/pprimarycensoreddist.R b/R/pprimarycensoreddist.R
@@ -41,7 +41,7 @@
 #'
 #' @details
 #' The primary event censored CDF is computed by integrating the product of
-#' the primary event distribution function (CDF) and the delay distribution
+#' the delay distribution function (CDF) and the primary event distribution
 #' function (PDF) over the primary event window. The integration is adjusted
 #' for truncation if a finite maximum delay (D) is specified.
 #'
@@ -50,15 +50,16 @@
 #' F_{\text{cens}}(q) = \int_{0}^{pwindow} F(q - p) \cdot f_{\text{primary}}(p)
 #' \, dp
 #' }
-#' where \eqn{F} is the CDF of the primary event distribution,
+#' where \eqn{F} is the CDF of the delay distribution,
 #' \eqn{f_{\text{primary}}} is the PDF of the primary event times, and
 #' \eqn{pwindow} is the primary event window.
 #'
-#' If the maximum delay \eqn{D} is finite, the CDF is normalized by \eqn{F(D)}:
+#' If the maximum delay \eqn{D} is finite, the CDF is normalized by dividing
+#' by \eqn{F_{\text{cens}}(D)}:
 #' \deqn{
-#' F_{\text{cens}}(q) = \int_{0}^{pwindow} \frac{F(q - p)}{F(D - p)} \cdot
-#' f_{\text{primary}}(p) \, dp
+#' F_{\text{cens,norm}}(q) = \frac{F_{\text{cens}}(q)}{F_{\text{cens}}(D)}
 #' }
+#' where \eqn{F_{\text{cens,norm}}(q)} is the normalized CDF.
 #'
 #' @family primarycensoreddist
 #'
@@ -82,28 +83,28 @@ pprimarycensoreddist <- function(
     if (d <= 0) {
       return(0) # Return 0 for non-positive delays
     } else {
-      if (is.infinite(D)) {
-        integrand <- function(p) {
-          d_adj <- d - p
-          pdist(d_adj, ...) *
-            do.call(
-              dprimary, c(list(x = p, min = 0, max = pwindow), dprimary_args)
-            )
-        }
-      } else {
-        integrand <- function(p) {
-          d_adj <- d - p
-          D_adj <- D - p
-          pdist(d_adj, ...) / pdist(D_adj, ...) *
-            do.call(
-              dprimary, c(list(x = p, min = 0, max = pwindow), dprimary_args)
-            )
-        }
+      integrand <- function(p) {
+        d_adj <- d - p
+        pdist(d_adj, ...) *
+          do.call(
+            dprimary, c(list(x = p, min = 0, max = pwindow), dprimary_args)
+          )
       }
+
       stats::integrate(integrand, lower = 0, upper = pwindow)$value
     }
   }, numeric(1))
 
+  if (!is.infinite(D)) {
+    # Compute normalization factor for finite D
+    normalizer <- if (max(q) == D) {
+      result[length(result)]
+    } else {
+      pprimarycensoreddist(D, pdist, pwindow, Inf, dprimary, dprimary_args, ...)
+    }
+    result <- result / normalizer
+  }
+
   return(result)
 }