Skip to content

Commit

Permalink
allow missing y
Browse files Browse the repository at this point in the history
  • Loading branch information
ConnorDonegan committed Mar 26, 2024
1 parent 5552796 commit 7c7bfac
Show file tree
Hide file tree
Showing 82 changed files with 1,621 additions and 2,004 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ cran-comments.md
/doc/
/Meta/
CRAN-RELEASE
CRAN-SUBMISSION
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: geostan
Title: Bayesian Spatial Analysis
Version: 0.5.4
Version: 0.6.0
Date: 2024-03-01
URL: https://connordonegan.github.io/geostan/
BugReports: https://github.com/ConnorDonegan/geostan/issues
Expand Down
8 changes: 7 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
# geostan 0.6.0

## New Additions

The model fitting functions (`stan_glm`, `stan_car`, etc.) now allow for missing data in the outcome variable and a new vignette provides the details. This functionality is not available for auto-Gaussian models - that is, CAR and SAR models that have been fit to continuous outcome variables - but is available for all other available models (including eigenvector spatial filtering `stan_esf` models for continuous outcomes, and all models for count outcomes [binomial and Poisson models]).

# geostan 0.5.4

Minor updates to the vignettees and documentation, also re-compiled geostan models using the latest StanHeaders (fixing an error on CRAN).

# geostan 0.5.3

### Minor changes
## Minor changes

The `gamma` function (which is available to help set prior distributions) has been renamed to `geostan::gamma2` to avoid conflict with `base::gamma`.

Expand Down
6 changes: 4 additions & 2 deletions R/convenience-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -1306,9 +1306,11 @@ exp_pars <- function(formula, data, C) {
MCM <- M %*% C %*% M
eigens <- eigen(MCM, symmetric = TRUE)
npos <- sum(eigens$values > 0)
sa <- mc(residuals(lm(formula, data = data)), C)
res <- residuals(lm(formula, data = data))
obs_idx <- which(!is.na(res))
sa <- mc(res[obs_idx], C[obs_idx, obs_idx])
X <- model.matrix(formula, data)
E_sa <- expected_mc(X, C)
E_sa <- expected_mc(X, C[obs_idx, obs_idx])
Sigma_sa <- sqrt( 2 / nlinks )
z_sa <- (sa - E_sa) / Sigma_sa
if (z_sa < -.59) {
Expand Down
4 changes: 2 additions & 2 deletions R/geostan_fit-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#' @method print geostan_fit
#' @rdname print_geostan_fit
print.geostan_fit <- function(x,
probs = c(0.025, 0.25, 0.5, 0.75, 0.975),
probs = c(0.025, 0.2, 0.5, 0.8, 0.975),
digits = 3,
pars = NULL, ...) {
pars <- c(pars, "intercept")
Expand Down Expand Up @@ -61,7 +61,7 @@ print.geostan_fit <- function(x,
cat("Link function: ", x$family$link, "\n")
cat("Residual Moran Coefficient: ", x$diagnostic["Residual_MC"], "\n")
if (!is.na(x$diagnostic["WAIC"])) cat("WAIC: ", x$diagnostic["WAIC"], "\n")
cat("Observations: ", nrow(x$data), "\n")
cat("Observations: ", x$N, "\n")
cat("Data models (ME): ")
if (x$ME$has_me) {
cat(paste(names(x$ME$se), sep = ", "))
Expand Down
29 changes: 20 additions & 9 deletions R/internal-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ SLX <- function(f, DF, x, W) {
#' @param y_mis_idx Index of censored counts in the outcome, if any.
make_data <- function(frame, x, y_mis_idx) {
y <- model.response(frame)
if (length(y_mis_idx) > 0) y[y_mis_idx] <- NA
offset <- model.offset(frame)
if (is.null(offset)) return(cbind(y, x))
return(cbind(y, offset, x))
Expand Down Expand Up @@ -127,25 +126,37 @@ inv_logit <- function(x) exp(x)/(1 + exp(x))
#'
#' @importFrom stats sd
#' @noRd
make_priors <- function(user_priors = NULL, y, x, hs_global_scale, scaling_factor = 2, link = c("identity", "log", "logit"), EV, offset) {
make_priors <- function(user_priors = NULL, y, trials, x, hs_global_scale, scaling_factor = 2, link = c("identity", "log", "logit"), EV, offset) {
link <- match.arg(link)
if (link == "identity") {
scale.y <- sd(y)
y <- na.omit(y)
scale.y <- max(sd(y), .1)
alpha_scale <- max(4 * sd(y), 5)
alpha_mean <- mean(y)
}
if (link == "log") {
if (any(y == 0)) y[which(y == 0)] <- 1 # local assignment only, not returned
y <- log(y / exp(offset))
if (any(is.infinite(y))) {
any_inf <- which(is.infinite(y))
y[any_inf] <- NA
}
y <- na.omit(y)
alpha_mean <- mean(y)
scale.y <- sd(y)
scale.y <- max(sd(y), .1, na.rm = T)
alpha_scale <- max(4 * scale.y, 5)
}
if (link == "logit") {
y <- y[,1] / (y[,1] + y[,2])
alpha_mean <- 0
scale.y <- sd(y)
alpha_scale <- 5
if (link == "logit") {
p <- y / trials
y <- log( p / (1-p))
if (any(is.infinite(y))) {
any_inf <- which(is.infinite(y))
y[any_inf] <- NA
}
y <- na.omit(y)
alpha_mean <- mean(y)
scale.y <- max(sd(y), 0.1)
alpha_scale <- max(4 * scale.y, 5)
}
priors <- list()
priors$intercept <- normal(location = alpha_mean, scale = alpha_scale)
Expand Down
13 changes: 5 additions & 8 deletions R/prep-censored-data.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@








#' @description Return index of observed, censored y; elsewhere, use results to replace NAs with zeros
#' This will stop if there are missing values and the censor_point argument is not being used; outside of this call, must check that censor_point argument is only used with Poisson likelihood.
#'
Expand All @@ -14,10 +9,12 @@
#' @noRd
#'
handle_censored_y <- function(censor, frame) {
y_raw <- as.numeric(model.response(frame))
y_raw <- model.response(frame)
if (inherits(y_raw, "matrix")) {
y_raw <- y_raw[,1] + y_raw[,2]
}
y_mis_idx <- which(is.na(y_raw))
y_obs_idx <- which(!is.na(y_raw))
if (censor == FALSE & length(y_mis_idx) > 0) stop("Missing values in y. If these are censored counts, you may want to use the censor_point argument.")
return (list(n_mis = length(y_mis_idx),
n_obs = length(y_obs_idx),
y_mis_idx = y_mis_idx,
Expand All @@ -29,5 +26,5 @@ handle_censored_y <- function(censor, frame) {
#' @noRd
handle_missing_x <- function(frame) {
check_NAs <- apply(as.matrix(frame[,-1]), 2, function(c) any(is.na(c)))
if (any(check_NAs)) stop("Missing values found in covariates or offset term. Even when using the censor_point argument, missing values are only permitted in the outcome variable.")
if (any(check_NAs)) stop("Missing values found in covariates or offset term. Missing values are only permitted in the outcome variable.")
}
39 changes: 24 additions & 15 deletions R/stan_car.R
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ stan_car <- function(formula,
if (family$family == "gaussian" | family$family == "auto_gaussian") family <- auto_gaussian(type = "CAR")
stopifnot(!missing(data))
check_car_parts(car_parts)
stopifnot(length(car_parts$Delta_inv) == nrow(data))
stopifnot(car_parts$n == nrow(data))
if (!missing(C)) {
stopifnot(inherits(C, "Matrix") | inherits(C, "matrix"))
stopifnot(all(dim(C) == nrow(data)))
Expand All @@ -267,13 +267,25 @@ stan_car <- function(formula,
if (missing(censor_point)) censor_point <- FALSE
mod_frame <- model.frame(formula, tmpdf, na.action = NULL)
handle_missing_x(mod_frame)
## prep Y for missingness //
y_index_list <- handle_censored_y(censor_point, mod_frame)
y <- y_int <- model.response(mod_frame)
## CAR: INCLUDE AUTO-GAUSSIAN AS family_int=5 -------------
if (family_int %in% c(1,2,5)) y_int <- rep(0, length(y))
## -------------
y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0
mod_frame[y_index_list$y_mis_idx, 1] <- 0
mi_idx <- y_index_list$y_mis_idx
y_tmp <- model.response(mod_frame)
if (family$family == "binomial") {
trials <- y_tmp[,1] + y_tmp[,2]
y <- y_int <- y_tmp[,1]
}
if (family$family == "poisson") {
y <- y_int <- y_tmp
trials <- rep(0, n)
}
if (family$family %in% c("gaussian", "auto_gaussian")) {
y <- y_tmp
y_int <- trials <- rep(0, n)
if ( length(y_index_list$y_mis_idx) > 0) stop("Missing values in y; missing values are not allow for CAR/SAR models with continuous (non-count) outcomes. You may want to try stan_esf.")
}
y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0
## //
if (is.null(model.offset(mod_frame))) {
offset <- rep(0, times = n)
} else {
Expand Down Expand Up @@ -339,7 +351,7 @@ stan_car <- function(formula,
prior_only = prior_only,
y = y,
y_int = y_int,
trials = rep(0, length(y)),
trials = trials,
#n = n, # getting n from car_parts, below
input_offset = offset,
has_re = has_re,
Expand All @@ -359,10 +371,11 @@ stan_car <- function(formula,
## PRIORS & CAR DATA -------------
is_student <- FALSE ##family$family == "student_t"
priors_made <- make_priors(user_priors = prior,
y = y,
y = y[y_index_list$y_obs_idx],
trials = trials[y_index_list$y_obs_idx],
x = x_full,
link = family$link,
offset = offset)
offset = offset[y_index_list$y_obs_idx])
if (is.null(prior$car_scale)) {
priors_made$car_scale <- priors_made$sigma
} else {
Expand Down Expand Up @@ -390,11 +403,6 @@ stan_car <- function(formula,
# append me.list to standata
standata <- c(standata, me.list)

## INTEGER OUTCOMES -------------
if (family$family == "binomial") {
standata$y <- standata$y_int <- y[,1]
standata$trials <- y[,1] + y[,2]
}
## PARAMETERS TO KEEP, with CAR PARAMETERS [START] -------------
pars <- c(pars, 'intercept', 'car_scale', 'car_rho', 'fitted', 'log_lik')
if (family_int < 5) pars <- c(pars, 'log_lambda_mu')
Expand Down Expand Up @@ -451,6 +459,7 @@ stan_car <- function(formula,
R <- resid(out, summary = FALSE)
out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) )
}
out$N <- length( y_index_list$y_obs_idx )
return (out)
}

34 changes: 22 additions & 12 deletions R/stan_esf.R
Original file line number Diff line number Diff line change
Expand Up @@ -229,11 +229,24 @@ stan_esf <- function(formula,
if (missing(censor_point)) censor_point <- FALSE
mod_frame <- model.frame(formula, tmpdf, na.action = NULL)
handle_missing_x(mod_frame)
## prep Y for missingness //
y_index_list <- handle_censored_y(censor_point, mod_frame)
y <- y_int <- model.response(mod_frame)
if (family_int %in% c(1,2)) y_int <- rep(0, length(y))
y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0
mod_frame[y_index_list$y_mis_idx, 1] <- 0
mi_idx <- y_index_list$y_mis_idx
y_tmp <- model.response(mod_frame)
if (family$family == "binomial") {
trials <- y_tmp[,1] + y_tmp[,2]
y <- y_int <- y_tmp[,1]
}
if (family$family == "poisson") {
y <- y_int <- y_tmp
trials <- rep(0, n)
}
if (family$family %in% c("gaussian", "student_t")) {
y <- y_tmp
y_int <- trials <- rep(0, n)
}
y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0
## //
if (is.null(model.offset(mod_frame))) {
offset <- rep(0, times = n)
} else {
Expand Down Expand Up @@ -289,7 +302,7 @@ stan_esf <- function(formula,
prior_only = prior_only,
y = y,
y_int = y_int,
trials = rep(0, length(y)),
trials = trials,
n = n,
input_offset = offset,
has_re = has_re,
Expand All @@ -316,12 +329,13 @@ stan_esf <- function(formula,
hs_global_scale <- min(1, p0/(dev-p0) / sqrt(n))
}
priors_made <- make_priors(user_priors = prior,
y = y,
y = y[y_index_list$y_obs_idx],
trials = trials[y_index_list$y_obs_idx],
x = x_full,
hs_global_scale = hs_global_scale,
EV = EV,
link = family$link,
offset = offset)
offset = offset[y_index_list$y_obs_idx])
standata <- append_priors(standata, priors_made)
esf_dl <- list(
dev = dev,
Expand All @@ -337,11 +351,6 @@ stan_esf <- function(formula,
## ME MODEL STUFF -------------
me.list <- make_me_data(ME, xraw)
standata <- c(standata, me.list)
## INTEGER OUTCOMES -------------
if (family$family == "binomial") {
standata$y <- standata$y_int <- y[,1]
standata$trials <- y[,1] + y[,2]
}
## PARAMETERS TO KEEP -------------
pars <- c(pars, 'intercept', 'esf', 'beta_ev', 'log_lik', 'fitted')
if (!intercept_only) pars <- c(pars, 'beta')
Expand Down Expand Up @@ -393,6 +402,7 @@ stan_esf <- function(formula,
R <- resid(out, summary = FALSE)
out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) )
}
out$N <- length( y_index_list$y_obs_idx )
return (out)
}

Expand Down
45 changes: 27 additions & 18 deletions R/stan_glm.R
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,24 @@ stan_glm <- function(formula,
if (missing(censor_point)) censor_point <- FALSE
mod_frame <- model.frame(formula, tmpdf, na.action = NULL)
handle_missing_x(mod_frame)
## prep Y for missingness //
y_index_list <- handle_censored_y(censor_point, mod_frame)
y <- y_int <- model.response(mod_frame)
if (family_int %in% c(1,2)) y_int <- rep(0, length(y))
y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0
mod_frame[y_index_list$y_mis_idx, 1] <- 0
mi_idx <- y_index_list$y_mis_idx
y_tmp <- model.response(mod_frame)
if (family$family == "binomial") {
trials <- y_tmp[,1] + y_tmp[,2]
y <- y_int <- y_tmp[,1]
}
if (family$family == "poisson") {
y <- y_int <- y_tmp
trials <- rep(0, n)
}
if (family$family %in% c("gaussian", "student_t")) {
y <- y_tmp
y_int <- trials <- rep(0, n)
}
y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0
## //
if (is.null(model.offset(mod_frame))) {
offset <- rep(0, times = n)
} else {
Expand Down Expand Up @@ -314,7 +327,7 @@ stan_glm <- function(formula,
prior_only = prior_only,
y = y,
y_int = y_int,
trials = rep(0, length(y)),
trials = trials,
n = n,
input_offset = offset,
has_re = has_re,
Expand All @@ -331,24 +344,20 @@ stan_glm <- function(formula,
)
## ADD MISSING/OBSERVED INDICES -------------
standata <- c(y_index_list, standata)
## PRIORS -------------
is_student <- family$family == "student_t"
## PRIORS -------------
is_student <- family$family == "student_t"
priors_made <- make_priors(user_priors = prior,
y = y,
y = y[y_index_list$y_obs_idx],
trials = trials[y_index_list$y_obs_idx],
x = x_full,
link = family$link,
offset = offset)
standata <- append_priors(standata, priors_made)
offset = offset[y_index_list$y_obs_idx])
standata <- append_priors(standata, priors_made)
## EMPTY PLACEHOLDERS
standata <- c(standata, empty_icar_data(n), empty_esf_data(n), empty_car_data(), empty_sar_data(n))
## ME MODEL -------------
me.list <- make_me_data(ME, xraw)
standata <- c(standata, me.list)
## INTEGER OUTCOMES -------------
if (family$family == "binomial") {
standata$y <- standata$y_int <- y[,1]
standata$trials <- y[,1] + y[,2]
}
## PARAMETERS TO KEEP -------------
pars <- c(pars, 'intercept', 'fitted', 'log_lik')
if (!intercept_only) pars <- c(pars, 'beta')
Expand Down Expand Up @@ -391,7 +400,7 @@ stan_glm <- function(formula,
out$re <- re_list
out$priors <- priors_made_slim
out$x_center <- get_x_center(standata, samples)
out$ME <- list(has_me = me.list$has_me, spatial_me = me.list$spatial_me)
out$ME <- list(has_me = me.list$has_me, spatial_me = me.list$spatial_me)
if (out$ME$has_me) out$ME <- c(out$ME, ME)
if (has_re) {
out$spatial <- data.frame(par = "alpha_re", method = "Exchangeable")
Expand All @@ -402,8 +411,8 @@ stan_glm <- function(formula,
out$C <- as(C, "sparseMatrix")
R <- resid(out, summary = FALSE)
out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) )
}
}
out$N <- length( y_index_list$y_obs_idx )
return(out)
}


Loading

0 comments on commit 7c7bfac

Please sign in to comment.