allow missing y

ConnorDonegan · Mar 26, 2024 · 7c7bfac · 7c7bfac
1 parent 5552796
commit 7c7bfac
Show file tree

Hide file tree

Showing 82 changed files with 1,621 additions and 2,004 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,4 @@ cran-comments.md
 /doc/
 /Meta/
 CRAN-RELEASE
+CRAN-SUBMISSION
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: geostan
 Title: Bayesian Spatial Analysis
-Version: 0.5.4
+Version: 0.6.0
 Date: 2024-03-01
 URL: https://connordonegan.github.io/geostan/
 BugReports: https://github.com/ConnorDonegan/geostan/issues

diff --git a/NEWS.md b/NEWS.md
@@ -1,10 +1,16 @@
+# geostan 0.6.0
+
+## New Additions
+
+The model fitting functions (`stan_glm`, `stan_car`, etc.) now allow for missing data in the outcome variable and a new vignette provides the details. This functionality is not available for auto-Gaussian models - that is, CAR and SAR models that have been fit to continuous outcome variables - but is available for all other available models (including eigenvector spatial filtering `stan_esf` models for continuous outcomes, and all models for count outcomes [binomial and Poisson models]).
+
 # geostan 0.5.4
 
 Minor updates to the vignettees and documentation, also re-compiled geostan models using the latest StanHeaders (fixing an error on CRAN).
 
 # geostan 0.5.3
 
-### Minor changes
+## Minor changes
 
 The `gamma` function (which is available to help set prior distributions) has been renamed to `geostan::gamma2` to avoid conflict with `base::gamma`. 
 

diff --git a/R/convenience-functions.R b/R/convenience-functions.R
@@ -1306,9 +1306,11 @@ exp_pars <- function(formula, data, C) {
     MCM <- M %*% C %*% M
     eigens <- eigen(MCM, symmetric = TRUE)
     npos <- sum(eigens$values > 0)
-    sa <- mc(residuals(lm(formula, data = data)), C)
+    res <- residuals(lm(formula, data = data))
+    obs_idx <- which(!is.na(res))
+    sa <- mc(res[obs_idx], C[obs_idx, obs_idx])
     X <- model.matrix(formula, data)
-    E_sa <- expected_mc(X, C)
+    E_sa <- expected_mc(X, C[obs_idx, obs_idx])
     Sigma_sa <- sqrt( 2 / nlinks )
     z_sa <- (sa - E_sa) / Sigma_sa
     if (z_sa < -.59) {

diff --git a/R/geostan_fit-methods.R b/R/geostan_fit-methods.R
@@ -32,7 +32,7 @@
 #' @method print geostan_fit
 #' @rdname print_geostan_fit
 print.geostan_fit <- function(x,
-                              probs = c(0.025, 0.25, 0.5, 0.75, 0.975),
+                              probs = c(0.025, 0.2, 0.5, 0.8, 0.975),
                               digits = 3,
                               pars = NULL, ...) {
   pars <- c(pars, "intercept")
@@ -61,7 +61,7 @@ print.geostan_fit <- function(x,
   cat("Link function: ", x$family$link, "\n")
   cat("Residual Moran Coefficient: ", x$diagnostic["Residual_MC"], "\n")
   if (!is.na(x$diagnostic["WAIC"])) cat("WAIC: ", x$diagnostic["WAIC"], "\n")
-  cat("Observations: ", nrow(x$data), "\n")  
+  cat("Observations: ", x$N, "\n")  
   cat("Data models (ME): ")
   if (x$ME$has_me) {
       cat(paste(names(x$ME$se), sep = ", ")) 

diff --git a/R/internal-functions.R b/R/internal-functions.R
@@ -79,7 +79,6 @@ SLX <- function(f, DF, x, W) {
 #' @param y_mis_idx Index of censored counts in the outcome, if any.
 make_data <- function(frame, x, y_mis_idx) {
     y <- model.response(frame)
-    if (length(y_mis_idx) > 0) y[y_mis_idx] <- NA
     offset <- model.offset(frame)
     if (is.null(offset)) return(cbind(y, x))
     return(cbind(y, offset, x))
@@ -127,25 +126,37 @@ inv_logit <- function(x) exp(x)/(1 + exp(x))
 #' 
 #' @importFrom stats sd
 #' @noRd
-make_priors <- function(user_priors = NULL, y, x, hs_global_scale, scaling_factor = 2, link = c("identity", "log", "logit"), EV, offset) {
+make_priors <- function(user_priors = NULL, y, trials, x, hs_global_scale, scaling_factor = 2, link = c("identity", "log", "logit"), EV, offset) {
     link <- match.arg(link)
     if (link == "identity") {
-        scale.y <- sd(y)
+        y <- na.omit(y)
+        scale.y <- max(sd(y), .1)
         alpha_scale <- max(4 * sd(y), 5)
         alpha_mean <- mean(y)
     }
   if (link == "log") {
       if (any(y == 0)) y[which(y == 0)] <- 1 # local assignment only, not returned
       y <- log(y / exp(offset))
+      if (any(is.infinite(y))) {
+          any_inf <- which(is.infinite(y))                
+          y[any_inf] <- NA
+      }
+      y <- na.omit(y)      
       alpha_mean <- mean(y)
-      scale.y <- sd(y)
+      scale.y <- max(sd(y), .1, na.rm = T)
       alpha_scale <- max(4 * scale.y, 5)
   }
-  if (link == "logit") {
-      y <- y[,1] / (y[,1] + y[,2])
-      alpha_mean <- 0
-      scale.y <- sd(y)
-      alpha_scale <- 5
+    if (link == "logit") {
+      p <- y / trials
+      y <- log( p / (1-p))
+      if (any(is.infinite(y))) {
+          any_inf <- which(is.infinite(y))                
+          y[any_inf] <- NA
+      }
+      y <- na.omit(y)            
+      alpha_mean <- mean(y)
+      scale.y <- max(sd(y), 0.1)
+      alpha_scale <- max(4 * scale.y, 5)
   }
     priors <- list()
     priors$intercept <- normal(location = alpha_mean, scale = alpha_scale)

diff --git a/R/prep-censored-data.R b/R/prep-censored-data.R
@@ -1,11 +1,6 @@
 
 
 
-
-
-
-
-
 #' @description Return index of observed, censored y; elsewhere, use results to replace NAs with zeros
 #' This will stop if there are missing values and the censor_point argument is not being used; outside of this call, must check that censor_point argument is only used with Poisson likelihood.
 #' 
@@ -14,10 +9,12 @@
 #' @noRd
 #' 
 handle_censored_y <- function(censor, frame) {
-    y_raw <- as.numeric(model.response(frame))
+    y_raw <- model.response(frame)
+    if (inherits(y_raw, "matrix")) {
+        y_raw <- y_raw[,1] + y_raw[,2]
+    }
     y_mis_idx <- which(is.na(y_raw))
     y_obs_idx <- which(!is.na(y_raw))
-    if (censor == FALSE & length(y_mis_idx) > 0) stop("Missing values in y. If these are censored counts, you may want to use the censor_point argument.")
     return (list(n_mis = length(y_mis_idx),
                 n_obs = length(y_obs_idx),
                 y_mis_idx = y_mis_idx,
@@ -29,5 +26,5 @@ handle_censored_y <- function(censor, frame) {
 #' @noRd
 handle_missing_x <- function(frame) {
     check_NAs <- apply(as.matrix(frame[,-1]), 2, function(c) any(is.na(c)))
-    if (any(check_NAs)) stop("Missing values found in covariates or offset term. Even when using the censor_point argument, missing values are only permitted in the outcome variable.")
+    if (any(check_NAs)) stop("Missing values found in covariates or offset term. Missing values are only permitted in the outcome variable.")
 }
diff --git a/R/stan_car.R b/R/stan_car.R
@@ -250,7 +250,7 @@ stan_car <- function(formula,
     if (family$family == "gaussian" | family$family == "auto_gaussian") family <- auto_gaussian(type = "CAR")
     stopifnot(!missing(data))
     check_car_parts(car_parts)
-    stopifnot(length(car_parts$Delta_inv) == nrow(data))
+    stopifnot(car_parts$n == nrow(data))
     if (!missing(C)) {
         stopifnot(inherits(C, "Matrix") | inherits(C, "matrix"))
         stopifnot(all(dim(C) == nrow(data)))
@@ -267,13 +267,25 @@ stan_car <- function(formula,
     if (missing(censor_point)) censor_point <- FALSE
     mod_frame <- model.frame(formula, tmpdf, na.action = NULL)
     handle_missing_x(mod_frame)
+    ## prep Y for missingness //
     y_index_list <- handle_censored_y(censor_point, mod_frame)
-    y <- y_int <- model.response(mod_frame)
-    ## CAR: INCLUDE AUTO-GAUSSIAN AS family_int=5 -------------      
-    if (family_int %in% c(1,2,5)) y_int <- rep(0, length(y))
-    ## -------------      
-    y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0
-    mod_frame[y_index_list$y_mis_idx, 1] <- 0 
+    mi_idx <- y_index_list$y_mis_idx
+    y_tmp <- model.response(mod_frame)
+    if (family$family == "binomial") {
+        trials <- y_tmp[,1] + y_tmp[,2]
+        y <- y_int <- y_tmp[,1]
+    }
+    if (family$family == "poisson") {
+        y <- y_int <- y_tmp        
+        trials <- rep(0, n)   
+    }
+    if (family$family %in% c("gaussian", "auto_gaussian")) {
+        y <- y_tmp
+        y_int <- trials <- rep(0, n)
+        if ( length(y_index_list$y_mis_idx) > 0) stop("Missing values in y; missing values are not allow for CAR/SAR models with continuous (non-count) outcomes. You may want to try stan_esf.")
+    }    
+    y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0
+    ## //    
     if (is.null(model.offset(mod_frame))) {
         offset <- rep(0, times = n)
     } else {
@@ -339,7 +351,7 @@ stan_car <- function(formula,
         prior_only = prior_only,        
         y = y,
         y_int = y_int,
-        trials = rep(0, length(y)),
+        trials = trials,
         #n = n, # getting n from car_parts, below
         input_offset = offset,
         has_re = has_re,
@@ -359,10 +371,11 @@ stan_car <- function(formula,
     ## PRIORS & CAR DATA -------------  
     is_student <- FALSE ##family$family == "student_t"
     priors_made <- make_priors(user_priors = prior,
-                               y = y,
+                               y = y[y_index_list$y_obs_idx],
+                               trials = trials[y_index_list$y_obs_idx],
                                x = x_full,
                                link = family$link,
-                               offset = offset)
+                               offset = offset[y_index_list$y_obs_idx])    
     if (is.null(prior$car_scale)) {
         priors_made$car_scale <- priors_made$sigma        
     } else {
@@ -390,11 +403,6 @@ stan_car <- function(formula,
     # append me.list to standata
     standata <- c(standata, me.list)
 
-    ## INTEGER OUTCOMES -------------    
-    if (family$family == "binomial") {
-        standata$y <- standata$y_int <- y[,1]
-        standata$trials <- y[,1] + y[,2]
-    }
     ## PARAMETERS TO KEEP, with CAR PARAMETERS [START] -------------            
     pars <- c(pars, 'intercept', 'car_scale', 'car_rho', 'fitted', 'log_lik')
     if (family_int < 5) pars <- c(pars, 'log_lambda_mu') 
@@ -451,6 +459,7 @@ stan_car <- function(formula,
         R <- resid(out, summary = FALSE)
         out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) )
     }
+    out$N <- length( y_index_list$y_obs_idx )    
     return (out)
 }
 
diff --git a/R/stan_esf.R b/R/stan_esf.R
@@ -229,11 +229,24 @@ stan_esf <- function(formula,
     if (missing(censor_point)) censor_point <- FALSE
     mod_frame <- model.frame(formula, tmpdf, na.action = NULL)
     handle_missing_x(mod_frame)
+    ## prep Y for missingness //
     y_index_list <- handle_censored_y(censor_point, mod_frame)
-    y <- y_int <- model.response(mod_frame)
-    if (family_int %in% c(1,2)) y_int <- rep(0, length(y))
-    y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0
-    mod_frame[y_index_list$y_mis_idx, 1] <- 0 
+    mi_idx <- y_index_list$y_mis_idx
+    y_tmp <- model.response(mod_frame)
+    if (family$family == "binomial") {
+        trials <- y_tmp[,1] + y_tmp[,2]
+        y <- y_int <- y_tmp[,1]
+    }
+    if (family$family == "poisson") {
+        y <- y_int <- y_tmp        
+        trials <- rep(0, n)
+    }
+    if (family$family %in% c("gaussian", "student_t")) {
+        y <- y_tmp
+        y_int <- trials <- rep(0, n)
+    }    
+    y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0
+    ## //    
     if (is.null(model.offset(mod_frame))) {
         offset <- rep(0, times = n)
     } else {
@@ -289,7 +302,7 @@ stan_esf <- function(formula,
         prior_only = prior_only,    
         y = y,
         y_int = y_int,
-        trials = rep(0, length(y)),
+        trials = trials,
         n = n,
         input_offset = offset,
         has_re = has_re,
@@ -316,12 +329,13 @@ stan_esf <- function(formula,
         hs_global_scale <- min(1, p0/(dev-p0) / sqrt(n))
     }
     priors_made <- make_priors(user_priors = prior,
-                               y = y,
+                               y = y[y_index_list$y_obs_idx],
+                               trials = trials[y_index_list$y_obs_idx],
                                x = x_full,
                                hs_global_scale = hs_global_scale,
                                EV = EV,                               
                                link = family$link,
-                               offset = offset)
+                               offset = offset[y_index_list$y_obs_idx])    
     standata <- append_priors(standata, priors_made)
     esf_dl <- list(
         dev = dev,
@@ -337,11 +351,6 @@ stan_esf <- function(formula,
     ## ME MODEL STUFF -------------  
     me.list <- make_me_data(ME, xraw)
     standata <- c(standata, me.list)
-    ## INTEGER OUTCOMES -------------    
-    if (family$family == "binomial") {
-      standata$y <- standata$y_int <- y[,1]
-      standata$trials <- y[,1] + y[,2]
-  }
     ## PARAMETERS TO KEEP -------------          
     pars <- c(pars, 'intercept', 'esf', 'beta_ev', 'log_lik', 'fitted')
     if (!intercept_only) pars <- c(pars, 'beta')
@@ -393,6 +402,7 @@ stan_esf <- function(formula,
         R <- resid(out, summary = FALSE)
         out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) )
     }
+    out$N <- length( y_index_list$y_obs_idx )    
   return (out)
 }
 

diff --git a/R/stan_glm.R b/R/stan_glm.R
@@ -244,11 +244,24 @@ stan_glm <- function(formula,
     if (missing(censor_point)) censor_point <- FALSE
     mod_frame <- model.frame(formula, tmpdf, na.action = NULL)
     handle_missing_x(mod_frame)
+    ## prep Y for missingness //
     y_index_list <- handle_censored_y(censor_point, mod_frame)
-    y <- y_int <- model.response(mod_frame)
-    if (family_int %in% c(1,2)) y_int <- rep(0, length(y))
-    y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0
-    mod_frame[y_index_list$y_mis_idx, 1] <- 0 
+    mi_idx <- y_index_list$y_mis_idx  
+    y_tmp <- model.response(mod_frame)
+    if (family$family == "binomial") {
+        trials <- y_tmp[,1] + y_tmp[,2]
+        y <- y_int <- y_tmp[,1]
+    }
+    if (family$family == "poisson") {
+        y <- y_int <- y_tmp        
+        trials <- rep(0, n)
+    }
+    if (family$family %in% c("gaussian", "student_t")) {
+        y <- y_tmp
+        y_int <- trials <- rep(0, n)
+    }    
+    y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0
+    ## //
     if (is.null(model.offset(mod_frame))) {
         offset <- rep(0, times = n)
     } else {
@@ -314,7 +327,7 @@ stan_glm <- function(formula,
     prior_only = prior_only,    
     y = y,
     y_int = y_int,
-    trials = rep(0, length(y)),
+    trials = trials,
     n = n,
     input_offset = offset,
     has_re = has_re,
@@ -331,24 +344,20 @@ stan_glm <- function(formula,
     )
     ## ADD MISSING/OBSERVED INDICES -------------  
     standata <- c(y_index_list, standata)
-    ## PRIORS -------------  
-    is_student <- family$family == "student_t"
+    ## PRIORS -------------
+    is_student <- family$family == "student_t"    
     priors_made <- make_priors(user_priors = prior,
-                               y = y,
+                               y = y[y_index_list$y_obs_idx],
+                               trials = trials[y_index_list$y_obs_idx],
                                x = x_full,
                                link = family$link,
-                               offset = offset)    
-    standata <- append_priors(standata, priors_made)    
+                               offset = offset[y_index_list$y_obs_idx])    
+    standata <- append_priors(standata, priors_made)
     ## EMPTY PLACEHOLDERS
     standata <- c(standata, empty_icar_data(n), empty_esf_data(n), empty_car_data(), empty_sar_data(n))
     ## ME MODEL -------------  
     me.list <- make_me_data(ME, xraw)
     standata <- c(standata, me.list)  
-    ## INTEGER OUTCOMES -------------    
-    if (family$family == "binomial") {
-        standata$y <- standata$y_int <- y[,1]
-        standata$trials <- y[,1] + y[,2]
-    }
     ## PARAMETERS TO KEEP -------------               
     pars <- c(pars, 'intercept', 'fitted', 'log_lik')
     if (!intercept_only) pars <- c(pars, 'beta')
@@ -391,7 +400,7 @@ stan_glm <- function(formula,
     out$re <- re_list
     out$priors <- priors_made_slim
     out$x_center <- get_x_center(standata, samples)
-    out$ME <- list(has_me = me.list$has_me, spatial_me = me.list$spatial_me)
+    out$ME <- list(has_me = me.list$has_me, spatial_me = me.list$spatial_me)    
     if (out$ME$has_me) out$ME <- c(out$ME, ME)
     if (has_re) {
         out$spatial <- data.frame(par = "alpha_re", method = "Exchangeable")
@@ -402,8 +411,8 @@ stan_glm <- function(formula,
         out$C <- as(C, "sparseMatrix")        
         R <- resid(out, summary = FALSE)
         out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) )
-    }        
+    }
+    out$N <- length( y_index_list$y_obs_idx )
     return(out)
 }
 
-