diff --git a/.gitignore b/.gitignore index a5c47354..43f75f04 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ cran-comments.md /doc/ /Meta/ CRAN-RELEASE +CRAN-SUBMISSION diff --git a/DESCRIPTION b/DESCRIPTION index 197ff1f6..834934ed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: geostan Title: Bayesian Spatial Analysis -Version: 0.5.4 +Version: 0.6.0 Date: 2024-03-01 URL: https://connordonegan.github.io/geostan/ BugReports: https://github.com/ConnorDonegan/geostan/issues diff --git a/NEWS.md b/NEWS.md index 3a13e1ae..bd168f6f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,16 @@ +# geostan 0.6.0 + +## New Additions + +The model fitting functions (`stan_glm`, `stan_car`, etc.) now allow for missing data in the outcome variable and a new vignette provides the details. This functionality is not available for auto-Gaussian models - that is, CAR and SAR models that have been fit to continuous outcome variables - but is available for all other available models (including eigenvector spatial filtering `stan_esf` models for continuous outcomes, and all models for count outcomes [binomial and Poisson models]). + # geostan 0.5.4 Minor updates to the vignettees and documentation, also re-compiled geostan models using the latest StanHeaders (fixing an error on CRAN). # geostan 0.5.3 -### Minor changes +## Minor changes The `gamma` function (which is available to help set prior distributions) has been renamed to `geostan::gamma2` to avoid conflict with `base::gamma`. diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 5448c614..05d12278 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -1306,9 +1306,11 @@ exp_pars <- function(formula, data, C) { MCM <- M %*% C %*% M eigens <- eigen(MCM, symmetric = TRUE) npos <- sum(eigens$values > 0) - sa <- mc(residuals(lm(formula, data = data)), C) + res <- residuals(lm(formula, data = data)) + obs_idx <- which(!is.na(res)) + sa <- mc(res[obs_idx], C[obs_idx, obs_idx]) X <- model.matrix(formula, data) - E_sa <- expected_mc(X, C) + E_sa <- expected_mc(X, C[obs_idx, obs_idx]) Sigma_sa <- sqrt( 2 / nlinks ) z_sa <- (sa - E_sa) / Sigma_sa if (z_sa < -.59) { diff --git a/R/geostan_fit-methods.R b/R/geostan_fit-methods.R index 5bc972da..136947f5 100644 --- a/R/geostan_fit-methods.R +++ b/R/geostan_fit-methods.R @@ -32,7 +32,7 @@ #' @method print geostan_fit #' @rdname print_geostan_fit print.geostan_fit <- function(x, - probs = c(0.025, 0.25, 0.5, 0.75, 0.975), + probs = c(0.025, 0.2, 0.5, 0.8, 0.975), digits = 3, pars = NULL, ...) { pars <- c(pars, "intercept") @@ -61,7 +61,7 @@ print.geostan_fit <- function(x, cat("Link function: ", x$family$link, "\n") cat("Residual Moran Coefficient: ", x$diagnostic["Residual_MC"], "\n") if (!is.na(x$diagnostic["WAIC"])) cat("WAIC: ", x$diagnostic["WAIC"], "\n") - cat("Observations: ", nrow(x$data), "\n") + cat("Observations: ", x$N, "\n") cat("Data models (ME): ") if (x$ME$has_me) { cat(paste(names(x$ME$se), sep = ", ")) diff --git a/R/internal-functions.R b/R/internal-functions.R index e7dfa8fb..7de40761 100644 --- a/R/internal-functions.R +++ b/R/internal-functions.R @@ -79,7 +79,6 @@ SLX <- function(f, DF, x, W) { #' @param y_mis_idx Index of censored counts in the outcome, if any. make_data <- function(frame, x, y_mis_idx) { y <- model.response(frame) - if (length(y_mis_idx) > 0) y[y_mis_idx] <- NA offset <- model.offset(frame) if (is.null(offset)) return(cbind(y, x)) return(cbind(y, offset, x)) @@ -127,25 +126,37 @@ inv_logit <- function(x) exp(x)/(1 + exp(x)) #' #' @importFrom stats sd #' @noRd -make_priors <- function(user_priors = NULL, y, x, hs_global_scale, scaling_factor = 2, link = c("identity", "log", "logit"), EV, offset) { +make_priors <- function(user_priors = NULL, y, trials, x, hs_global_scale, scaling_factor = 2, link = c("identity", "log", "logit"), EV, offset) { link <- match.arg(link) if (link == "identity") { - scale.y <- sd(y) + y <- na.omit(y) + scale.y <- max(sd(y), .1) alpha_scale <- max(4 * sd(y), 5) alpha_mean <- mean(y) } if (link == "log") { if (any(y == 0)) y[which(y == 0)] <- 1 # local assignment only, not returned y <- log(y / exp(offset)) + if (any(is.infinite(y))) { + any_inf <- which(is.infinite(y)) + y[any_inf] <- NA + } + y <- na.omit(y) alpha_mean <- mean(y) - scale.y <- sd(y) + scale.y <- max(sd(y), .1, na.rm = T) alpha_scale <- max(4 * scale.y, 5) } - if (link == "logit") { - y <- y[,1] / (y[,1] + y[,2]) - alpha_mean <- 0 - scale.y <- sd(y) - alpha_scale <- 5 + if (link == "logit") { + p <- y / trials + y <- log( p / (1-p)) + if (any(is.infinite(y))) { + any_inf <- which(is.infinite(y)) + y[any_inf] <- NA + } + y <- na.omit(y) + alpha_mean <- mean(y) + scale.y <- max(sd(y), 0.1) + alpha_scale <- max(4 * scale.y, 5) } priors <- list() priors$intercept <- normal(location = alpha_mean, scale = alpha_scale) diff --git a/R/prep-censored-data.R b/R/prep-censored-data.R index ea0cb6d4..5143e48a 100644 --- a/R/prep-censored-data.R +++ b/R/prep-censored-data.R @@ -1,11 +1,6 @@ - - - - - #' @description Return index of observed, censored y; elsewhere, use results to replace NAs with zeros #' This will stop if there are missing values and the censor_point argument is not being used; outside of this call, must check that censor_point argument is only used with Poisson likelihood. #' @@ -14,10 +9,12 @@ #' @noRd #' handle_censored_y <- function(censor, frame) { - y_raw <- as.numeric(model.response(frame)) + y_raw <- model.response(frame) + if (inherits(y_raw, "matrix")) { + y_raw <- y_raw[,1] + y_raw[,2] + } y_mis_idx <- which(is.na(y_raw)) y_obs_idx <- which(!is.na(y_raw)) - if (censor == FALSE & length(y_mis_idx) > 0) stop("Missing values in y. If these are censored counts, you may want to use the censor_point argument.") return (list(n_mis = length(y_mis_idx), n_obs = length(y_obs_idx), y_mis_idx = y_mis_idx, @@ -29,5 +26,5 @@ handle_censored_y <- function(censor, frame) { #' @noRd handle_missing_x <- function(frame) { check_NAs <- apply(as.matrix(frame[,-1]), 2, function(c) any(is.na(c))) - if (any(check_NAs)) stop("Missing values found in covariates or offset term. Even when using the censor_point argument, missing values are only permitted in the outcome variable.") + if (any(check_NAs)) stop("Missing values found in covariates or offset term. Missing values are only permitted in the outcome variable.") } diff --git a/R/stan_car.R b/R/stan_car.R index 0d876bf3..90e3b775 100644 --- a/R/stan_car.R +++ b/R/stan_car.R @@ -250,7 +250,7 @@ stan_car <- function(formula, if (family$family == "gaussian" | family$family == "auto_gaussian") family <- auto_gaussian(type = "CAR") stopifnot(!missing(data)) check_car_parts(car_parts) - stopifnot(length(car_parts$Delta_inv) == nrow(data)) + stopifnot(car_parts$n == nrow(data)) if (!missing(C)) { stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) stopifnot(all(dim(C) == nrow(data))) @@ -267,13 +267,25 @@ stan_car <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness // y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - ## CAR: INCLUDE AUTO-GAUSSIAN AS family_int=5 ------------- - if (family_int %in% c(1,2,5)) y_int <- rep(0, length(y)) - ## ------------- - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "auto_gaussian")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + if ( length(y_index_list$y_mis_idx) > 0) stop("Missing values in y; missing values are not allow for CAR/SAR models with continuous (non-count) outcomes. You may want to try stan_esf.") + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -339,7 +351,7 @@ stan_car <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, #n = n, # getting n from car_parts, below input_offset = offset, has_re = has_re, @@ -359,10 +371,11 @@ stan_car <- function(formula, ## PRIORS & CAR DATA ------------- is_student <- FALSE ##family$family == "student_t" priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, link = family$link, - offset = offset) + offset = offset[y_index_list$y_obs_idx]) if (is.null(prior$car_scale)) { priors_made$car_scale <- priors_made$sigma } else { @@ -390,11 +403,6 @@ stan_car <- function(formula, # append me.list to standata standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP, with CAR PARAMETERS [START] ------------- pars <- c(pars, 'intercept', 'car_scale', 'car_rho', 'fitted', 'log_lik') if (family_int < 5) pars <- c(pars, 'log_lambda_mu') @@ -451,6 +459,7 @@ stan_car <- function(formula, R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) } + out$N <- length( y_index_list$y_obs_idx ) return (out) } diff --git a/R/stan_esf.R b/R/stan_esf.R index b517b579..00f2a507 100644 --- a/R/stan_esf.R +++ b/R/stan_esf.R @@ -229,11 +229,24 @@ stan_esf <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness // y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - if (family_int %in% c(1,2)) y_int <- rep(0, length(y)) - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "student_t")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -289,7 +302,7 @@ stan_esf <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, n = n, input_offset = offset, has_re = has_re, @@ -316,12 +329,13 @@ stan_esf <- function(formula, hs_global_scale <- min(1, p0/(dev-p0) / sqrt(n)) } priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, hs_global_scale = hs_global_scale, EV = EV, link = family$link, - offset = offset) + offset = offset[y_index_list$y_obs_idx]) standata <- append_priors(standata, priors_made) esf_dl <- list( dev = dev, @@ -337,11 +351,6 @@ stan_esf <- function(formula, ## ME MODEL STUFF ------------- me.list <- make_me_data(ME, xraw) standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP ------------- pars <- c(pars, 'intercept', 'esf', 'beta_ev', 'log_lik', 'fitted') if (!intercept_only) pars <- c(pars, 'beta') @@ -393,6 +402,7 @@ stan_esf <- function(formula, R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) } + out$N <- length( y_index_list$y_obs_idx ) return (out) } diff --git a/R/stan_glm.R b/R/stan_glm.R index 221cd8ba..6baffb3f 100644 --- a/R/stan_glm.R +++ b/R/stan_glm.R @@ -244,11 +244,24 @@ stan_glm <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness // y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - if (family_int %in% c(1,2)) y_int <- rep(0, length(y)) - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "student_t")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -314,7 +327,7 @@ stan_glm <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, n = n, input_offset = offset, has_re = has_re, @@ -331,24 +344,20 @@ stan_glm <- function(formula, ) ## ADD MISSING/OBSERVED INDICES ------------- standata <- c(y_index_list, standata) - ## PRIORS ------------- - is_student <- family$family == "student_t" + ## PRIORS ------------- + is_student <- family$family == "student_t" priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, link = family$link, - offset = offset) - standata <- append_priors(standata, priors_made) + offset = offset[y_index_list$y_obs_idx]) + standata <- append_priors(standata, priors_made) ## EMPTY PLACEHOLDERS standata <- c(standata, empty_icar_data(n), empty_esf_data(n), empty_car_data(), empty_sar_data(n)) ## ME MODEL ------------- me.list <- make_me_data(ME, xraw) standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP ------------- pars <- c(pars, 'intercept', 'fitted', 'log_lik') if (!intercept_only) pars <- c(pars, 'beta') @@ -391,7 +400,7 @@ stan_glm <- function(formula, out$re <- re_list out$priors <- priors_made_slim out$x_center <- get_x_center(standata, samples) - out$ME <- list(has_me = me.list$has_me, spatial_me = me.list$spatial_me) + out$ME <- list(has_me = me.list$has_me, spatial_me = me.list$spatial_me) if (out$ME$has_me) out$ME <- c(out$ME, ME) if (has_re) { out$spatial <- data.frame(par = "alpha_re", method = "Exchangeable") @@ -402,8 +411,8 @@ stan_glm <- function(formula, out$C <- as(C, "sparseMatrix") R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) - } + } + out$N <- length( y_index_list$y_obs_idx ) return(out) } - diff --git a/R/stan_icar.R b/R/stan_icar.R index c5a5ec14..52bf7503 100644 --- a/R/stan_icar.R +++ b/R/stan_icar.R @@ -286,11 +286,24 @@ stan_icar <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness // y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - if (family_int %in% c(1,2)) y_int <- rep(0, length(y)) - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "student_t")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -346,7 +359,7 @@ stan_icar <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, n = n, input_offset = offset, has_re = has_re, @@ -363,13 +376,14 @@ stan_icar <- function(formula, ) ## ADD MISSING/OBSERVED INDICES ------------- standata <- c(y_index_list, standata) - ## PRIORS ------------- + ## PRIORS ------------- is_student <- family$family == "student_t" priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, link = family$link, - offset = offset) + offset = offset[y_index_list$y_obs_idx]) standata <- append_priors(standata, priors_made) ## ICAR DATA [START] ------------- iar.list <- prep_icar_data(C, scale_factor = scale_factor) @@ -382,11 +396,6 @@ stan_icar <- function(formula, ## ME MODEL ------------- me.list <- make_me_data(ME, xraw) standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP with ICAR [START] ------------- pars <- c(pars, 'intercept', 'spatial_scale', 'fitted', 'phi', 'log_lik') if (type == "bym2") pars <- c(pars, 'theta', 'rho') @@ -442,6 +451,7 @@ stan_icar <- function(formula, R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) } + out$N <- length( y_index_list$y_obs_idx ) return (out) } diff --git a/R/stan_sar.R b/R/stan_sar.R index 99a6fa6b..d4663bb8 100644 --- a/R/stan_sar.R +++ b/R/stan_sar.R @@ -247,6 +247,7 @@ stan_sar <- function(formula, if (family$family == "gaussian" | family$family == "auto_gaussian") family <- auto_gaussian(type = "SAR") stopifnot(!missing(data)) check_sar_parts(sar_parts) + stopifnot(sar_parts$n == nrow(data)) if (!missing(C)) { stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) stopifnot(all(dim(C) == nrow(data))) @@ -260,13 +261,25 @@ stan_sar <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness // y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - ## SAR: INCLUDE AUTO-GAUSSIAN AS family_int=6 ------------- - if (family_int %in% c(1,2,6)) y_int <- rep(0, length(y)) - ## ------------- - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "auto_gaussian")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + if ( length(y_index_list$y_mis_idx) > 0) stop("Missing values in y; missing values are not allow for CAR/SAR models with continuous (non-count) outcomes. You may want to try stan_esf.") + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -332,7 +345,7 @@ stan_sar <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, #n = n, # getting n from sar_parts, below input_offset = offset, has_re = has_re, @@ -352,10 +365,11 @@ stan_sar <- function(formula, ## PRIORS & SAR DATA ------------- is_student <- FALSE ##family$family == "student_t" priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, link = family$link, - offset = offset) + offset = offset[y_index_list$y_obs_idx]) if (is.null(prior$sar_scale)) { priors_made$sar_scale <- priors_made$sigma } else { @@ -375,11 +389,6 @@ stan_sar <- function(formula, ## ME MODEL ------------- me.list <- make_me_data(ME, xraw) standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP, with SAR PARAMETERS [START] ------------- pars <- c(pars, 'intercept', 'sar_scale', 'sar_rho', 'fitted', 'log_lik') if (family_int < 6) pars <- c(pars, 'log_lambda_mu') @@ -433,7 +442,8 @@ stan_sar <- function(formula, out$C <- as(C, "sparseMatrix") R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) - } + } + out$N <- length( y_index_list$y_obs_idx ) return (out) } diff --git a/README.Rmd b/README.Rmd index dab402ef..5fb011b7 100644 --- a/README.Rmd +++ b/README.Rmd @@ -20,7 +20,7 @@ knitr::opts_chunk$set( ## geostan: Bayesian spatial analysis -The [**geostan**](https://connordonegan.github.io/geostan/) R package supports a complete spatial analysis workflow with Bayesian models for areal data, including a suite of functions for visualizing spatial data and model results. For demonstrations and discussion, see the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html) on spatial autocorrelation, spatial measurement error models, and spatial regression with raster layers. +The [**geostan**](https://connordonegan.github.io/geostan/) R package supports a complete spatial analysis workflow with Bayesian models for areal data, including a suite of functions for visualizing spatial data and model results. For demonstrations and discussion, see the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html) on spatial autocorrelation, spatial measurement error models, spatial regression with raster layers, and building custom spatial model in Stan. The package is particularly suitable for public health research with spatial data, and complements the [**surveil**](https://connordonegan.github.io/surveil/) R package for time series analysis of public health surveillance data. diff --git a/README.html b/README.html deleted file mode 100644 index 1561109b..00000000 --- a/README.html +++ /dev/null @@ -1,702 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - -

geostan: Bayesian spatial analysis

-

The geostan R package supports a complete spatial analysis workflow with Bayesian models for areal data, including a suite of functions for visualizing spatial data and model results. For demonstrations and discussion, see the package help pages and vignettes on spatial autocorrelation, spatial measurement error models, and spatial regression with raster layers.

-

The package is particularly suitable for public health research with spatial data, and complements the surveil R package for time series analysis of public health surveillance data.

-

geostan models were built using Stan, a state-of-the-art platform for Bayesian modeling.

-

DOI

-

Disease mapping and spatial regression

-

Statistical models for data recorded across areal units like states, counties, or census tracts.

-

Observational uncertainty

-

Incorporate information on data reliability, such as standard errors of American Community Survey estimates, into any geostan model.

-

Censored observations

-

Vital statistics and disease surveillance systems like CDC Wonder censor case counts that fall below a threshold number; geostan can model disease or mortality risk with censored observations.

-

Spatial analysis tools

-

Tools for visualizing and measuring spatial autocorrelation and map patterns, for exploratory analysis and model diagnostics.

-

The RStan ecosystem

-

Interfaces easily with many high-quality R packages for Bayesian modeling.

-

Custom spatial models

-

Tools for building custom spatial models in Stan.

-

Installation

-

Install geostan from CRAN using:

-
install.packages("geostan")
-

Support

-

All functions and methods are documented (with examples) on the website reference page. See the package vignettes for more on exploratory spatial data analysis, spatial measurement error models, and spatial regression with large raster layers.

-

To ask questions, report a bug, or discuss ideas for improvements or new features please visit the issues page, start a discussion, or submit a pull request.

-

Usage

-

Load the package and the georgia county mortality data set (ages 55-64, years 2014-2018):

-
library(geostan)
-data(georgia)
-

The sp_diag function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map:

-
A <- shape2mat(georgia, style = "B")
-sp_diag(georgia$rate.female, georgia, w = A)
-#> 3 NA values found in x; they will be dropped from the data before creating the Moran plot. If matrix w was row-standardized, it no longer is. You may want to use a binary connectivity matrix using style = 'B' in shape2mat.
-#> Warning: Removed 3 rows containing non-finite values (`stat_bin()`).
- - -

There are three censored observations in the georgia female mortality data, which means there were 9 or fewer deaths in those counties. The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. By using the censor_point argument we include our information on the censored observations to obtain results for all counties:

-
cars <- prep_car_data(A)
-#> Range of permissible rho values:  -1.661134 1
-fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)),
-                censor_point = 9,
-        data = georgia,
-        car_parts = cars,
-        family = poisson(),
-        cores = 4, # for multi-core processing
-        refresh = 0) # to silence some printing
-#> 
-#> *Setting prior parameters for intercept
-#> Distribution: normal
-#>   location scale
-#> 1     -4.7     5
-#> 
-#> *Setting prior for CAR scale parameter (car_scale)
-#> Distribution: student_t
-#>   df location scale
-#> 1 10        0     3
-#> 
-#> *Setting prior for CAR spatial autocorrelation parameter (car_rho)
-#> Distribution: uniform
-#>   lower upper
-#> 1  -1.7     1
-

Passing a fitted model to the sp_diag function will return a set of diagnostics for spatial models:

-
sp_diag(fit, georgia, w = A)
-#> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.
-#> 3 NA values found in x; they will be dropped from the data before creating the Moran plot. If matrix w was row-standardized, it no longer is. You may want to use a binary connectivity matrix using style = 'B' in shape2mat.
-#> Warning: Removed 3 rows containing missing values
-#> (`geom_pointrange()`).
- - -

The print method returns a summary of the probability distributions for model parameters, as well as Markov chain Monte Carlo (MCMC) diagnostics from Stan (Monte Carlo standard errors of the mean se_mean, effective sample size n_eff, and the R-hat statistic Rhat):

-
print(fit)
-#> Spatial Model Results 
-#> Formula: deaths.female ~ offset(log(pop.at.risk.female))
-#> Spatial method (outcome):  CAR 
-#> Likelihood function:  poisson 
-#> Link function:  log 
-#> Residual Moran Coefficient:  0.0021755 
-#> WAIC:  1291.91 
-#> Observations:  159 
-#> Data models (ME): none
-#> Inference for Stan model: foundation.
-#> 4 chains, each with iter=2000; warmup=1000; thin=1; 
-#> post-warmup draws per chain=1000, total post-warmup draws=4000.
-#> 
-#>             mean se_mean    sd   2.5%    25%    50%    75%  97.5% n_eff  Rhat
-#> intercept -4.673   0.002 0.093 -4.843 -4.716 -4.674 -4.630 -4.497  1636 1.000
-#> car_rho    0.922   0.001 0.058  0.778  0.893  0.935  0.967  0.995  3214 1.001
-#> car_scale  0.458   0.001 0.035  0.393  0.433  0.455  0.481  0.532  3867 1.000
-#> 
-#> Samples were drawn using NUTS(diag_e) at Wed Oct  4 12:20:45 2023.
-#> For each parameter, n_eff is a crude measure of effective sample size,
-#> and Rhat is the potential scale reduction factor on split chains (at 
-#> convergence, Rhat=1).
-

More demonstrations can be found in the package help pages and vignettes.

- - - diff --git a/README.md b/README.md index 2ab76a71..775418be 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,9 @@ and model results. For demonstrations and discussion, see the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html) -on spatial autocorrelation, spatial measurement error models, and -spatial regression with raster layers. +on spatial autocorrelation, spatial measurement error models, spatial +regression with raster layers, and building custom spatial model in +Stan. The package is particularly suitable for public health research with spatial data, and complements the @@ -86,6 +87,7 @@ Load the package and the `georgia` county mortality data set (ages ``` r library(geostan) +#> This is geostan version 0.5.3 data(georgia) ``` @@ -95,7 +97,7 @@ including a histogram, Moran scatter plot, and map: ``` r A <- shape2mat(georgia, style = "B") sp_diag(georgia$rate.female, georgia, w = A) -#> 3 NA values found in x; they will be dropped from the data before creating the Moran plot. If matrix w was row-standardized, it no longer is. You may want to use a binary connectivity matrix using style = 'B' in shape2mat. +#> 3 NA values found in x will be dropped from data x and matrix w #> Warning: Removed 3 rows containing non-finite values (`stat_bin()`). ``` @@ -133,6 +135,12 @@ fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)), #> Distribution: uniform #> lower upper #> 1 -1.7 1 +#> Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable. +#> Running the chains for more iterations may help. See +#> https://mc-stan.org/misc/warnings.html#bulk-ess +#> Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable. +#> Running the chains for more iterations may help. See +#> https://mc-stan.org/misc/warnings.html#tail-ess ``` Passing a fitted model to the `sp_diag` function will return a set of @@ -141,9 +149,8 @@ diagnostics for spatial models: ``` r sp_diag(fit, georgia, w = A) #> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE. -#> 3 NA values found in x; they will be dropped from the data before creating the Moran plot. If matrix w was row-standardized, it no longer is. You may want to use a binary connectivity matrix using style = 'B' in shape2mat. -#> Warning: Removed 3 rows containing missing values -#> (`geom_pointrange()`). +#> 3 NA values found in x will be dropped from data x and matrix w +#> Warning: Removed 3 rows containing missing values (`geom_pointrange()`). ``` @@ -161,8 +168,8 @@ print(fit) #> Spatial method (outcome): CAR #> Likelihood function: poisson #> Link function: log -#> Residual Moran Coefficient: 0.0021755 -#> WAIC: 1291.91 +#> Residual Moran Coefficient: -0.0004015 +#> WAIC: 1290.5 #> Observations: 159 #> Data models (ME): none #> Inference for Stan model: foundation. @@ -170,11 +177,11 @@ print(fit) #> post-warmup draws per chain=1000, total post-warmup draws=4000. #> #> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat -#> intercept -4.673 0.002 0.093 -4.843 -4.716 -4.674 -4.630 -4.497 1636 1.000 -#> car_rho 0.922 0.001 0.058 0.778 0.893 0.935 0.967 0.995 3214 1.001 -#> car_scale 0.458 0.001 0.035 0.393 0.433 0.455 0.481 0.532 3867 1.000 +#> intercept -4.620 0.055 0.418 -4.851 -4.719 -4.674 -4.627 -4.334 57 1.074 +#> car_rho 0.926 0.001 0.057 0.783 0.896 0.937 0.968 0.999 1494 1.002 +#> car_scale 0.457 0.001 0.035 0.394 0.433 0.455 0.479 0.529 3559 1.000 #> -#> Samples were drawn using NUTS(diag_e) at Wed Oct 4 12:20:45 2023. +#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 14:14:50 2024. #> For each parameter, n_eff is a crude measure of effective sample size, #> and Rhat is the potential scale reduction factor on split chains (at #> convergence, Rhat=1). diff --git a/cran-comments.md~ b/cran-comments.md~ deleted file mode 100644 index 88d6be0e..00000000 --- a/cran-comments.md~ +++ /dev/null @@ -1,3 +0,0 @@ - - -Package was archived on CRAN due to gcc-UBSAN issues which stem from StanHeaders. StanHeaders has since been updated to fix this issue. diff --git a/docs/404.html b/docs/404.html index 7d80cb4f..f40cf104 100644 --- a/docs/404.html +++ b/docs/404.html @@ -39,7 +39,7 @@ geostan - 0.5.4 + 0.6.0 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index dfd4bb61..99c4be3b 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0 diff --git a/docs/LICENSE.html b/docs/LICENSE.html index 9298d5e7..665d0f87 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0 @@ -50,6 +50,7 @@

GNU General Public License

+

Version 3, 29 June 2007
Copyright © 2007 Free Software Foundation, Inc. <http://fsf.org/>

Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.

@@ -68,7 +69,7 @@

Preamble

TERMS AND CONDITIONS

-

0. Definitions

+

0. Definitions

“This License” refers to version 3 of the GNU General Public License.

“Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks.

“The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations.

@@ -79,7 +80,7 @@

0. DefinitionsAn interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion.

-

1. Source Code

+

1. Source Code

The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work.

A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language.

The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it.

@@ -88,23 +89,23 @@

1. Source CodeThe Corresponding Source for a work in source code form is that same work.

-

2. Basic Permissions

+

2. Basic Permissions

All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law.

You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you.

Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary.

- +

No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures.

When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work’s users, your or third parties’ legal rights to forbid circumvention of technological measures.

-

4. Conveying Verbatim Copies

+

4. Conveying Verbatim Copies

You may convey verbatim copies of the Program’s source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program.

You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee.

-

5. Conveying Modified Source Versions

+

5. Conveying Modified Source Versions

You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions:

  • a) The work must carry prominent notices stating that you modified it, and giving a relevant date.
  • @@ -117,7 +118,7 @@

    5. Conveying Modified Source Versi

A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation’s users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate.

-

6. Conveying Non-Source Forms

+

6. Conveying Non-Source Forms

You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways:

  • a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange.
  • @@ -137,7 +138,7 @@

    6. Conveying Non-Source Forms -

    7. Additional Terms

    +

    7. Additional Terms

    “Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions.

    When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission.

    Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms:

    @@ -158,24 +159,24 @@

    7. Additional Terms -

    8. Termination

    +

    8. Termination

    You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11).

    However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation.

    Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice.

    Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10.

-

9. Acceptance Not Required for Having Copies

+

9. Acceptance Not Required for Having Copies

You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so.

-

10. Automatic Licensing of Downstream Recipients

+

10. Automatic Licensing of Downstream Recipients

Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License.

An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party’s predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts.

You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it.

-

11. Patents

+

11. Patents

A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor’s “contributor version”.

A contributor’s “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License.

Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor’s essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version.

@@ -186,30 +187,30 @@

11. Patents -

12. No Surrender of Others’ Freedom

+

12. No Surrender of Others’ Freedom

If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program.

-

13. Use with the GNU Affero General Public License

+

13. Use with the GNU Affero General Public License

Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such.

-

14. Revised Versions of this License

+

14. Revised Versions of this License

The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.

Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation.

If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy’s public statement of acceptance of a version permanently authorizes you to choose that version for the Program.

Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version.

-

15. Disclaimer of Warranty

+

15. Disclaimer of Warranty

THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

-

16. Limitation of Liability

+

16. Limitation of Liability

IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

-

17. Interpretation of Sections 15 and 16

+

17. Interpretation of Sections 15 and 16

If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee.

END OF TERMS AND CONDITIONS

@@ -218,27 +219,27 @@

17. Interpretation of Sections

How to Apply These Terms to Your New Programs

If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.

To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the “copyright” line and a pointer to where the full notice is found.

-
<one line to give the program's name and a brief idea of what it does.>
-Copyright (C) <year>  <name of author>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
<one line to give the program's name and a brief idea of what it does.>
+Copyright (C) <year>  <name of author>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode:

-
<program>  Copyright (C) <year>  <name of author>
-This program comes with ABSOLUTELY NO WARRANTY; for details type 'show w'.
-This is free software, and you are welcome to redistribute it
-under certain conditions; type 'show c' for details.
+
<program>  Copyright (C) <year>  <name of author>
+This program comes with ABSOLUTELY NO WARRANTY; for details type 'show w'.
+This is free software, and you are welcome to redistribute it
+under certain conditions; type 'show c' for details.

The hypothetical commands show w and show c should show the appropriate parts of the General Public License. Of course, your program’s commands might be different; for a GUI interface, you would use an “about box”.

You should also get your employer (if you work as a programmer) or school, if any, to sign a “copyright disclaimer” for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>.

The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>.

diff --git a/docs/articles/custom-spatial-models.html b/docs/articles/custom-spatial-models.html index 999e4e46..1825b657 100644 --- a/docs/articles/custom-spatial-models.html +++ b/docs/articles/custom-spatial-models.html @@ -40,7 +40,7 @@ geostan - 0.5.4 + 0.6.0
@@ -73,11 +73,13 @@ -
+
@@ -97,14 +97,14 @@

Arguments

Details

When rates = FALSE and the model is Poisson or Binomial, the fitted values returned by the fitted method are the expected value of the response variable. The rates argument is used to translate count outcomes to rates by dividing by the appropriate denominator. The behavior of the rates argument depends on the model specification. Consider a Poisson model of disease incidence, such as the following intercept-only case:

-

fit <- stan_glm(y ~ offset(log(E)),
-               data = data,
-               family = poisson())

+

fit <- stan_glm(y ~ offset(log(E)),
+               data = data,
+               family = poisson())

If the fitted values are extracted using rates = FALSE, then fitted(fit) will return the expectation of \(y\). If rates = TRUE (the default), then fitted(fit) will return the expected value of the rate \(\frac{y}{E}\).

If a binomial model is used instead of the Poisson, then using rates = TRUE will return the expectation of \(\frac{y}{N}\) where \(N\) is the sum of the number of 'successes' and 'failures', as in:

-

fit <- stan_glm(cbind(successes, failures) ~ 1,
-               data = data,
-               family = binomial())

+

fit <- stan_glm(cbind(successes, failures) ~ 1,
+               data = data,
+               family = binomial())

diff --git a/docs/reference/row_standardize.html b/docs/reference/row_standardize.html index b88c9dd7..1067c486 100644 --- a/docs/reference/row_standardize.html +++ b/docs/reference/row_standardize.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0
diff --git a/docs/reference/samples_geostan_fit.html b/docs/reference/samples_geostan_fit.html index 4a0089f1..1dc5459a 100644 --- a/docs/reference/samples_geostan_fit.html +++ b/docs/reference/samples_geostan_fit.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0
diff --git a/docs/reference/se_log.html b/docs/reference/se_log.html index cad91cd8..771180ff 100644 --- a/docs/reference/se_log.html +++ b/docs/reference/se_log.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0
diff --git a/docs/reference/sentencing.html b/docs/reference/sentencing.html index b694b4a0..bcdc98d3 100644 --- a/docs/reference/sentencing.html +++ b/docs/reference/sentencing.html @@ -18,7 +18,7 @@ geostan - 0.5.4 + 0.6.0
diff --git a/docs/reference/shape2mat.html b/docs/reference/shape2mat.html index 3d45fe40..b058e336 100644 --- a/docs/reference/shape2mat.html +++ b/docs/reference/shape2mat.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0
diff --git a/docs/reference/sim_sar.html b/docs/reference/sim_sar.html index 97b6c0a7..d60bf599 100644 --- a/docs/reference/sim_sar.html +++ b/docs/reference/sim_sar.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0 diff --git a/docs/reference/sp_diag.html b/docs/reference/sp_diag.html index 4013a256..6aa8b6bf 100644 --- a/docs/reference/sp_diag.html +++ b/docs/reference/sp_diag.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0 diff --git a/docs/reference/stan_car.html b/docs/reference/stan_car.html index 53fa676e..ade85f4a 100644 --- a/docs/reference/stan_car.html +++ b/docs/reference/stan_car.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0 @@ -103,8 +103,8 @@

Arguments

re

To include a varying intercept (or "random effects") term, alpha_re, specify the grouping variable here using formula syntax, as in ~ ID. Then, alpha_re is a vector of parameters added to the linear predictor of the model, and:

-

alpha_re ~ N(0, alpha_tau)
-alpha_tau ~ Student_t(d.f., location, scale).

+

alpha_re ~ N(0, alpha_tau)
+alpha_tau ~ Student_t(d.f., location, scale).

With the CAR model, any alpha_re term should be at a different level or scale than the observations; that is, at a different scale than the autocorrelation structure of the CAR model itself.

diff --git a/docs/reference/stan_esf.html b/docs/reference/stan_esf.html index 548cca55..85d5f473 100644 --- a/docs/reference/stan_esf.html +++ b/docs/reference/stan_esf.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0 @@ -110,8 +110,8 @@

Arguments

re

To include a varying intercept (or "random effects") term, alpha_re, specify the grouping variable here using formula syntax, as in ~ ID. Then, alpha_re is a vector of parameters added to the linear predictor of the model, and:

-

alpha_re ~ N(0, alpha_tau)
-alpha_tau ~ Student_t(d.f., location, scale).

+

alpha_re ~ N(0, alpha_tau)
+alpha_tau ~ Student_t(d.f., location, scale).

data
diff --git a/docs/reference/stan_glm.html b/docs/reference/stan_glm.html index 0377e74c..6dcc5e1a 100644 --- a/docs/reference/stan_glm.html +++ b/docs/reference/stan_glm.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0 @@ -98,8 +98,8 @@

Arguments

re

To include a varying intercept (or "random effects") term, alpha_re, specify the grouping variable here using formula syntax, as in ~ ID. Then, alpha_re is a vector of parameters added to the linear predictor of the model, and:

-

alpha_re ~ N(0, alpha_tau)
-alpha_tau ~ Student_t(d.f., location, scale).

+

alpha_re ~ N(0, alpha_tau)
+alpha_tau ~ Student_t(d.f., location, scale).

data
@@ -278,9 +278,9 @@

Spatially lagged covariates (SLX)shape2mat), \(WX\) is the mean neighboring value of \(X\), and \(\gamma\) is a coefficient vector. This specifies a regression with spatially lagged covariates. SLX terms can specified by providing a formula to the slx argument:

-

stan_glm(y ~ x1 + x2, slx = ~ x1 + x2, \...),

+

stan_glm(y ~ x1 + x2, slx = ~ x1 + x2, \...),

which is a shortcut for

-

stan_glm(y ~ I(W \%*\% x1) + I(W \%*\% x2) + x1 + x2, \...)

+

stan_glm(y ~ I(W \%*\% x1) + I(W \%*\% x2) + x1 + x2, \...)

SLX terms will always be prepended to the design matrix, as above, which is important to know when setting prior distributions for regression coefficients.

For measurement error (ME) models, the SLX argument is the only way to include spatially lagged covariates since the SLX term needs to be re-calculated on each iteration of the MCMC algorithm.

diff --git a/docs/reference/stan_icar.html b/docs/reference/stan_icar.html index eb7e5e58..957b6e08 100644 --- a/docs/reference/stan_icar.html +++ b/docs/reference/stan_icar.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0 @@ -106,8 +106,8 @@

Arguments

re

To include a varying intercept (or "random effects") term, alpha_re, specify the grouping variable here using formula syntax, as in ~ ID. Then, alpha_re is a vector of parameters added to the linear predictor of the model, and:

-

alpha_re ~ N(0, alpha_tau)
-alpha_tau ~ Student_t(d.f., location, scale).

+

alpha_re ~ N(0, alpha_tau)
+alpha_tau ~ Student_t(d.f., location, scale).

Before using this term, read the Details section and the type argument. Specifically, if you use type = bym, then an observational-level re term is already included in the model. (Similar for type = bym2.)

@@ -315,48 +315,48 @@

BYM2

$$ \tau \sim Gaussian(0, 1) $$

The terms \(\tilde{\phi}\), \(\tilde{\theta}\) are standard normal deviates, \(\rho\) is restricted to values between zero and one, and \(S\) is the 'scale_factor' (a constant term provided by the user). By default, the 'scale_factor' is equal to one, so that it does nothing. Riebler et al. (2016) argue that the interpretation or meaning of the scale of the ICAR model depends on the graph structure of the connectivity matrix \(C\). This implies that the same prior distribution assigned to \(\tau_s\) will differ in its implications if \(C\) is changed; in other words, the priors are not transportable across models, and models that use the same nominal prior actually have different priors assigned to \(\tau_s\).

Borrowing R code from Morris (2017) and following Freni-Sterrantino et al. (2018), the following R code can be used to create the 'scale_factor' \(S\) for the BYM2 model (note, this requires the INLA R package), given a spatial adjacency matrix, \(C\):

-

## create a list of data for stan_icar
-icar.data <- geostan::prep_icar_data(C)
-## calculate scale_factor for each of k connected group of nodes
-k <- icar.data$k
-scale_factor <- vector(mode = "numeric", length = k)
-for (j in 1:k) {
-  g.idx <- which(icar.data$comp_id == j) 
-  if (length(g.idx) == 1) {
-       scale_factor[j] <- 1
-       next
-    }    
-  Cg <- C[g.idx, g.idx] 
-  scale_factor[j] <- scale_c(Cg) 
-}

+

## create a list of data for stan_icar
+icar.data <- geostan::prep_icar_data(C)
+## calculate scale_factor for each of k connected group of nodes
+k <- icar.data$k
+scale_factor <- vector(mode = "numeric", length = k)
+for (j in 1:k) {
+  g.idx <- which(icar.data$comp_id == j) 
+  if (length(g.idx) == 1) {
+       scale_factor[j] <- 1
+       next
+    }    
+  Cg <- C[g.idx, g.idx] 
+  scale_factor[j] <- scale_c(Cg) 
+}

This code adjusts for 'islands' or areas with zero neighbors, and it also handles disconnected graph structures (see Donegan and Morris 2021). Following Freni-Sterrantino (2018), disconnected components of the graph structure are given their own intercept term; however, this value is added to \(\phi\) automatically inside the Stan model. Therefore, the user never needs to make any adjustments for this term. (To avoid complications from using a disconnected graph structure, you can apply a proper CAR model instead of the ICAR: stan_car).

Note, the code above requires the scale_c function; it has package dependencies that are not included in geostan. To use scale_c, you have to load the following R function:

-

#' compute scaling factor for adjacency matrix, accounting for differences in spatial connectivity 
-#'
-#' @param C connectivity matrix
-#'
-#' @details
-#'
-#' Requires the following packages: 
-#'
-#' library(Matrix)
-#' library(INLA);
-#' library(spdep)
-#' library(igraph)
-#'  
-#' @source
-#'
-#'   Morris, Mitzi (2017). Spatial Models in Stan: Intrinsic Auto-Regressive Models for Areal Data. <https://mc-stan.org/users/documentation/case-studies/icar_stan.html>
-#'
-scale_c <- function(C) {
- geometric_mean <- function(x) exp(mean(log(x))) 
- N = dim(C)[1]
- Q =  Diagonal(N, rowSums(C)) - C
- Q_pert = Q + Diagonal(N) * max(diag(Q)) * sqrt(.Machine$double.eps)
- Q_inv = inla.qinv(Q_pert, constr=list(A = matrix(1,1,N),e=0))
- scaling_factor <- geometric_mean(Matrix::diag(Q_inv)) 
- return(scaling_factor) 
-}

+

#' compute scaling factor for adjacency matrix, accounting for differences in spatial connectivity 
+#'
+#' @param C connectivity matrix
+#'
+#' @details
+#'
+#' Requires the following packages: 
+#'
+#' library(Matrix)
+#' library(INLA);
+#' library(spdep)
+#' library(igraph)
+#'  
+#' @source
+#'
+#'   Morris, Mitzi (2017). Spatial Models in Stan: Intrinsic Auto-Regressive Models for Areal Data. <https://mc-stan.org/users/documentation/case-studies/icar_stan.html>
+#'
+scale_c <- function(C) {
+ geometric_mean <- function(x) exp(mean(log(x))) 
+ N = dim(C)[1]
+ Q =  Diagonal(N, rowSums(C)) - C
+ Q_pert = Q + Diagonal(N) * max(diag(Q)) * sqrt(.Machine$double.eps)
+ Q_inv = inla.qinv(Q_pert, constr=list(A = matrix(1,1,N),e=0))
+ scaling_factor <- geometric_mean(Matrix::diag(Q_inv)) 
+ return(scaling_factor) 
+}

diff --git a/docs/reference/stan_sar.html b/docs/reference/stan_sar.html index 3e2393e6..3bb59861 100644 --- a/docs/reference/stan_sar.html +++ b/docs/reference/stan_sar.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0
@@ -103,8 +103,8 @@

Arguments

re

To include a varying intercept (or "random effects") term, alpha_re, specify the grouping variable here using formula syntax, as in ~ ID. Then, alpha_re is a vector of parameters added to the linear predictor of the model, and:

-

alpha_re ~ N(0, alpha_tau)
-alpha_tau ~ Student_t(d.f., location, scale).

+

alpha_re ~ N(0, alpha_tau)
+alpha_tau ~ Student_t(d.f., location, scale).

With the SAR model, any alpha_re term should be at a different level or scale than the observations; that is, at a different scale than the autocorrelation structure of the SAR model itself.

diff --git a/docs/reference/waic.html b/docs/reference/waic.html index 83e318d3..4e7f7083 100644 --- a/docs/reference/waic.html +++ b/docs/reference/waic.html @@ -17,7 +17,7 @@ geostan - 0.5.4 + 0.6.0 diff --git a/inst/stan/parts/data.stan b/inst/stan/parts/data.stan index d63670f0..ec03310e 100644 --- a/inst/stan/parts/data.stan +++ b/inst/stan/parts/data.stan @@ -6,11 +6,13 @@ int center_x; -// censored counts +// missing y int n_mis; int n_obs; array[n_mis] int y_mis_idx; array[n_obs] int y_obs_idx; + +// censored counts int censor_point; // outcome diff --git a/inst/stan/parts/gen_quants_declaration.stan b/inst/stan/parts/gen_quants_declaration.stan index aa261023..4a23b0cc 100644 --- a/inst/stan/parts/gen_quants_declaration.stan +++ b/inst/stan/parts/gen_quants_declaration.stan @@ -1,5 +1,5 @@ // any other declarations in this block must be made *before* this included file - vector[is_auto_gaussian ? 1 : n] log_lik; + vector[is_auto_gaussian ? 1 : n_obs] log_lik; diff --git a/inst/stan/parts/gen_quants_expression.stan b/inst/stan/parts/gen_quants_expression.stan index a476e662..6795bd96 100644 --- a/inst/stan/parts/gen_quants_expression.stan +++ b/inst/stan/parts/gen_quants_expression.stan @@ -18,17 +18,17 @@ eigenvalues_w, n); } - for (i in 1:n) { + for (i in 1:n_obs) { if (is_student) { - log_lik[i] = student_t_lpdf(y[i] | nu[1], fitted[i], sigma[has_sigma]); + log_lik[i] = student_t_lpdf(y[y_obs_idx[i]] | nu[1], fitted[y_obs_idx[i]], sigma[has_sigma]); } if (is_gaussian) { - log_lik[i] = normal_lpdf(y[i] | fitted[i], sigma[has_sigma]); + log_lik[i] = normal_lpdf(y[y_obs_idx[i]] | fitted[y_obs_idx[i]], sigma[has_sigma]); } if (is_poisson) { - log_lik[i] = poisson_lpmf(y_int[i] | fitted[i]); + log_lik[i] = poisson_lpmf(y_int[y_obs_idx[i]] | fitted[y_obs_idx[i]]); } if (is_binomial) { - log_lik[i] = binomial_lpmf(y_int[i] | trials[i], fitted[i]); + log_lik[i] = binomial_lpmf(y_int[y_obs_idx[i]] | trials[y_obs_idx[i]], fitted[y_obs_idx[i]]); } } diff --git a/inst/stan/parts/model.stan b/inst/stan/parts/model.stan index 7b03b88f..1d8b9680 100644 --- a/inst/stan/parts/model.stan +++ b/inst/stan/parts/model.stan @@ -5,6 +5,7 @@ if (dx_all) target += normal_lpdf(append_row(gamma, beta) | prior_beta_location, prior_beta_scale); if (has_sigma) target += student_t_lpdf(sigma | prior_sigma[1], prior_sigma[2], prior_sigma[3]); if (is_student) target += gamma_lpdf(nu[1] | prior_t_nu[1], prior_t_nu[2]); + // data models (observational uncertainty) if (dx_me) { if (spatial_me) { @@ -32,20 +33,24 @@ target += normal_lpdf(mu_x_true | prior_mux_true_location, prior_mux_true_scale); target += student_t_lpdf(sigma_x_true | prior_sigmax_true_df, prior_sigmax_true_location, prior_sigmax_true_scale); } + // partial pooling of observations across all groups/geographies (varying intercept) if (has_re) { target += normal_lpdf(alpha_re | 0, alpha_tau[has_re]); target += student_t_lpdf(alpha_tau[has_re] | prior_alpha_tau[1], prior_alpha_tau[2], prior_alpha_tau[3]); } + // process model (likelihood) if (!prior_only) { - if (is_student) target += student_t_lpdf(y | nu[1], fitted, sigma[has_sigma]); - if (is_gaussian) target += normal_lpdf(y | fitted, sigma[has_sigma]); + if (is_student) target += student_t_lpdf(y[y_obs_idx] | nu[1], fitted[y_obs_idx], sigma[has_sigma]); + if (is_gaussian) target += normal_lpdf(y[y_obs_idx] | fitted[y_obs_idx], sigma[has_sigma]); if (is_poisson) { target += poisson_lpmf(y_int[y_obs_idx] | fitted[y_obs_idx]); if (censor_point > 0) target += poisson_lcdf(censor_point | fitted[y_mis_idx]); } - if (is_binomial) target += binomial_lpmf(y_int | trials, fitted); + if (is_binomial) { + target += binomial_lpmf(y_int[y_obs_idx] | trials[y_obs_idx], fitted[y_obs_idx]); + } } // ICAR @@ -59,6 +64,7 @@ phi_tilde ~ icar_normal(spatial_scale[1], node1, node2, k, group_size, group_idx, has_theta); if (m) target += normal_lpdf(alpha_phi | 0, prior_alpha[2]); } + // ESF if (dev) { target += std_normal_lpdf(z); @@ -68,6 +74,7 @@ target += inv_gamma_lpdf(aux2_global[1] | 0.5, 0.5); // .5 * nu_local, .5 * nu_global, both = 1 target += inv_gamma_lpdf(caux[1] | 0.5*slab_df, 0.5*slab_df); } + // CAR if (car) { target += student_t_lpdf(car_scale[1] | prior_sigma[1], prior_sigma[2], prior_sigma[3]); diff --git a/man/figures/README-unnamed-chunk-5-1.png b/man/figures/README-unnamed-chunk-5-1.png index 29d309fc..703ce6e8 100644 Binary files a/man/figures/README-unnamed-chunk-5-1.png and b/man/figures/README-unnamed-chunk-5-1.png differ diff --git a/tests/testthat/test-missing-y.R b/tests/testthat/test-missing-y.R new file mode 100644 index 00000000..c531127f --- /dev/null +++ b/tests/testthat/test-missing-y.R @@ -0,0 +1,76 @@ +iter=10 +refresh = 0 +source("helpers.R") + +context("missing y") + +test_that("Poisson missing y", { + data(georgia) + N = nrow(georgia) + georgia$deaths.female[sample.int(N, size = 12)] <- NA + georgia$deaths.female[sample.int(N, size = 3)] <- 0 + SW( + fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)), + data = georgia, + car_parts = prep_car_data(shape2mat(georgia, "B")), + chains = 1, + family = poisson(), + iter = iter, + refresh = refresh) + ) + expect_geostan(fit) +}) + +test_that("Binomial missing y", { + data(georgia) + A = shape2mat(georgia, "B") + N = nrow(A) + georgia$deaths.female[sample.int(N, size = 25)] <- NA + georgia$y <- round(georgia$deaths.female / 10) + georgia$y[sample.int(N, 5)] <- 0 + georgia$f <- round(4 * georgia$deaths.female / 10) + georgia$f[which(!is.na(georgia$y))[1]] <- NA + SW( + fit <- stan_glm(cbind(y, f) ~ 1, + data = georgia, + # car_parts = prep_car_data(shape2mat(georgia, "B")), + chains = 1, + family = binomial(), + iter = iter, + refresh = refresh) + ) + expect_geostan(fit) + SW( + + fit <- stan_icar(cbind(y, f) ~ 1, + data = georgia, + C = A, + chains = 1, + family = binomial(), + iter = iter, + refresh = refresh) + + ) + +}) + + +test_that("ESF missing y", { + data(georgia) + georgia$deaths.female[1:10] <- NA + georgia$y <- georgia$deaths.female + georgia$f <- round(4 * georgia$deaths.female) + SW( + fit <- stan_esf(cbind(y, f) ~ log(income), + data = georgia, + C = shape2mat(georgia, "B"), + chains = 1, + family = binomial(), + iter = iter, + refresh = refresh) + ) + expect_geostan(fit) +}) + + + diff --git a/tests/testthat/test-stan-glm.R b/tests/testthat/test-stan-glm.R index 3633d9f5..e2764fa5 100644 --- a/tests/testthat/test-stan-glm.R +++ b/tests/testthat/test-stan-glm.R @@ -10,7 +10,7 @@ test_that("GLM works", { data = sentencing, chains = 1, family = poisson(), - init_r = 0.2, + init_r = 0.2, iter = iter, refresh = refresh) ) diff --git a/vignettes/custom-spatial-models.Rmd b/vignettes/custom-spatial-models.Rmd index bb51bee1..53675bab 100644 --- a/vignettes/custom-spatial-models.Rmd +++ b/vignettes/custom-spatial-models.Rmd @@ -29,9 +29,9 @@ This tutorial is written with the assumption that the reader is already familiar The CAR model is a multivariate normal distribution with covariance matrix $$\Sigma = (I - \rho \cdot C)^{-1} M,$$ where $I$ is the identity matrix, $\rho$ a spatial autocorrelation parameter which may be positive or negative, $C$ is a sparse connectivity matrix, and $M$ is a diagonal matrix with scale parameters $\tau_i^2$. There is typically a single scale parameter $\tau$ multiplied by weights $\delta_i$. The term $\tau^2 \cdot \delta_i$ is the conditional variance pertaining to $y_i | y_{n(i)}$ where $n(i)$ lists the areas that neighbor the $i^{th}$ one. -The defining feature of the CAR model is that, similar to time-series autoregression, the expectation for the value at the $i^{th}$ site given any or all other values of $y$ besides $i$ ($y_{-i}$) are known, is a function of the neighboring values: $$\mathbb{E}[y_i | y_{-i}] = \mu_ + \sum_{j \in n(i)} \rho \cdot c_{ij} (y_j - \mu).$$ +The defining feature of the CAR model is that, similar to time-series autoregression, the expectation for the value at the $i^{th}$ site given any or all other values of $y$ besides $i$ ($y_{-i}$) are known, is a function of the neighboring values: $$\mathbb{E}[y_i | y_{-i}] = \mu_j + \sum_{j \in n(i)} \rho \cdot c_{ij} (y_j - \mu_j).$$ -The CAR function presented here is valid for what is sometimes called the WCAR specification. This is the most commonly employed CAR specification. The connectivity matrix is row-standardized. Let $A$ be a symmetric binary connectivity matrix where $a_{ij}= 1$ only if the $i^{th}$ and $j^{th}$ sites are neighbors, all other entries are zero (including the diagonal entries $i=j$). The elements of $C$ are +The CAR function presented here is valid for what is sometimes called the WCAR specification. This is the most commonly employed CAR specification (see @donegan_2022 for CAR models without this restriction). The connectivity matrix is row-standardized. Let $A$ be a symmetric binary connectivity matrix where $a_{ij}= 1$ only if the $i^{th}$ and $j^{th}$ sites are neighbors, all other entries are zero (including the diagonal entries $i=j$). The elements of $C$ are $$c_{ij} = \frac{a_{ij}}{\sum_j^n a_{ij}} = \frac{a_{ij}}{|n(i)|}.$$ The diagonal of matrix $M$ then is specified by the n-length vector of weights $\delta_i = \tau^2 \cdot \frac{1}{|n(i)|}$. @@ -482,4 +482,5 @@ fit <- stan_sar(log(income / 10e3) ~ log(population / 10e3), data = georgia, One can also use the SAR model as a prior distribution for a parameter vector, just as was done above with the CAR model. + # References diff --git a/vignettes/custom-spatial-models.Rmd~ b/vignettes/custom-spatial-models.Rmd~ deleted file mode 100644 index 27994485..00000000 --- a/vignettes/custom-spatial-models.Rmd~ +++ /dev/null @@ -1,483 +0,0 @@ ---- -title: "Custom spatial models with RStan and geostan" -date: October 2, 2023 -author: Connor Donegan -output: - rmarkdown::html_vignette: - toc: true -header-includes: - - \usepackage{amsmath} -vignette: > - %\VignetteIndexEntry{Custom spatial models with RStan and geostan} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} -bibliography: spatial-me.bib -link-citations: yes ---- - -```{r, include=FALSE} -knitr::opts_chunk$set(eval = FALSE, echo = TRUE) -``` - -The spatial models in geostan use custom Stan functions that are far more efficient than using built-in functions, including the conditional (CAR) and simultaneous spatial autoregressive (SAR) models (both are particular specifications of the multivariate normal distribution). This vignette shows how you can use those custom functions together with some R functions in geostan to build custom spatial models using RStan. - -This tutorial is written with the assumption that the reader is already familiar with [RStan](https://mc-stan.org/users/documentation/). Users are expected to make adjustments to the code as needed, including to prior distributions. For more details and an explanation of the computational approach, see @donegan_2022. For more background on spatial modeling see @haining_2020. - -# CAR models - -The CAR model is a multivariate normal distribution with covariance matrix $$\Sigma = (I - \rho \cdot C)^{-1} M,$$ where $I$ is the identity matrix, $\rho$ a spatial autocorrelation parameter which may be positive or negative, $C$ is a sparse connectivity matrix, and $M$ is a diagonal matrix with scale parameters $\tau_i^2$. There is typically a single scale parameter $\tau$ multiplied by weights $\delta_i$. The term $\tau^2 \cdot \delta_i$ is the conditional variance pertaining to $y_i | y_{n(i)}$ where $n(i)$ lists the areas that neighbor the $i^{th}$ one. - -The defining feature of the CAR model is that, similar to time-series autoregression, the expectation for the value at the $i^{th}$ site given any or all other values of $y$ besides $i$ ($y_{-i}$) are known, is a function of the neighboring values: $$\mathbb{E}[y_i | y_{-i}] = \mu_ + \sum_{j \in n(i)} \rho \cdot c_{ij} (y_j - \mu).$$ - -The CAR function presented here is valid for what is sometimes called the WCAR specification. This is the most commonly employed CAR specification. The connectivity matrix is row-standardized. Let $A$ be a symmetric binary connectivity matrix where $a_{ij}= 1$ only if the $i^{th}$ and $j^{th}$ sites are neighbors, all other entries are zero (including the diagonal entries $i=j$). The elements of $C$ are -$$c_{ij} = \frac{a_{ij}}{\sum_j^n a_{ij}} = \frac{a_{ij}}{|n(i)|}.$$ - -The diagonal of matrix $M$ then is specified by the n-length vector of weights $\delta_i = \tau^2 \cdot \frac{1}{|n(i)|}$. - -The `geostan::prep_car_data` function will complete all of this for you. The user passes in a binary connectivity matrix $A$ and and specifies `style = "WCAR"`, then the function returns a list of data inputs for the Stan model. - -## Autonormal model - -This first example is an autonormal model: $$y \sim Normal(\alpha + x \beta, \Sigma).$$ Here you have $n$ observations of a continuously measured outcome $y$, possibly $k=1$ or more covariates $x$, and you're accounting for spatial autocorrelation in $y$ using the covariance matrix of an $n$-dimensional multivariate normal distribution. - -The following Stan code implements the above model; if you're following along, copy the Stan code and save it in a file inside your working directory; I'll name it "autonormal.stan" and I'll save the name as an R variable: - -```{r} -autonormal_file <- "autonormal.stan" -``` - -Here's the Stan code: - -```{stan output.var= "auto_normal"} -functions { -#include wcar-lpdf.stan -} - -data { - // data - int n; - int k; - vector[n] y; - matrix[n, k] x; - - // CAR parts - int nAx_w; - int nC; - vector[nAx_w] Ax_w; - array[nAx_w] int Ax_v; - array[n + 1] int Ax_u; - array[nC] int Cidx; - vector[n] Delta_inv; - real log_det_Delta_inv; - vector[n] lambda; -} - -parameters { - // spatial autocorrelation (SA) parameter - real rho; - - // scale parameter - real tau; - - // intercept - real alpha; - - // coefficients - vector[k] beta; -} - -model { - vector[n] mu = alpha + x * beta; - - // Likelihood: y ~ Normal(Mu, Sigma) - target += wcar_normal_lpdf(y | - mu, tau, rho, // mean, scale, SA - Ax_w, Ax_v, Ax_u, // stuff from prep_car_data - Delta_inv, - log_det_Delta_inv, - lambda, - n); - - // prior for scale parameter - target += student_t_lpdf(tau | 10, 0, 5); - - // prior for beta - target += normal_lpdf(beta | 0, 5); - - // prior for intercept - target += normal_lpdf(alpha | 0, 5); -} - -``` - -The input file "wcar-lpdf.stan" contains the following code (save this code inside your working directory and name it "wcar-lpdf.stan"): - -```{stan output.var = 'wcar_fun', eval = FALSE} -/** - * Log probability density of the conditional autoregressive (CAR) model: WCAR specifications only - * - * @param y Process to model - * @param mu Mean vector - * @param tau Scale parameter - * @param rho Spatial dependence parameter - * @param A_w Sparse representation of the symmetric connectivity matrix, A - * @param A_v Column indices for values in A_w - * @param A_u Row starting indices for values in A_u - * @param D_inv The row sums of A; i.e., the diagonal elements from the inverse of Delta, where M = Delta * tau^2 is a diagonal matrix containing the conditional variances. - * @param log_det_D_inv Log determinant of Delta inverse. - * @param lambda Eigenvalues of C (or of the symmetric, scaled matrix Delta^{-1/2}*C*Delta^{1/2}); for the WCAR specification, C is the row-standardized version of W. - * @param n Length of y - * - * @return Log probability density of CAR prior up to additive constant - */ -real wcar_normal_lpdf(vector y, vector mu, - real tau, real rho, - vector A_w, array[] int A_v, array[] int A_u, - vector D_inv, real log_det_D_inv, - vector lambda, - int n) { - vector[n] z = y - mu; - real ztDz; - real ztAz; - vector[n] ldet_ImrhoC; - ztDz = (z .* D_inv)' * z; - ztAz = z' * csr_matrix_times_vector(n, n, A_w, A_v, A_u, z); - for (i in 1:n) ldet_ImrhoC[i] = log1m(rho * lambda[i]); - return 0.5 * ( - -n * log( 2 * pi() ) - -2 * n * log(tau) - + log_det_D_inv - + sum(ldet_ImrhoC) - - (1 / tau^2) * (ztDz - rho * ztAz)); -} -``` - -From R, you can use the following code to prepare the input data ($y$, $x$, etc.). I use `prep_car_data` to get a list of parts for the CAR model, then I append the outcome data to the same list and pass it all to a Stan model to draw samples. - -```{r} -library(rstan) -library(geostan) -data(georgia) - -A <- shape2mat(georgia, "B") -car_list <- prep_car_data(A, style = "WCAR") - -# add data -car_list$y <- log(georgia$income / 10e3) -car_list$x <- matrix(log(georgia$population / 10e3), ncol = 1) -car_list$k <- ncol(car_list$x) - -# compile Stan model from file -car_model <- stan_model(autonormal_file) - -# sample from model -samples <- sampling(car_model, data = car_list, iter = 1e3) -``` - -The same results can be obtained using `geostan::stan_car`: - -```{r} -fit <- stan_car(log(income / 10e3) ~ log(population / 10e3), - data = georgia, car = car_list, iter = 1e3) -``` - - -## Poisson models - -Disease mapping is a common use for CAR models, though these models have many applications. In this class of models, the CAR model is assigned as the prior distribution to a parameter vector $\phi$, which is used to model disease incidence rates across small areas like counties. The disease data consist of counts $y$ together with the size of population at risk $p$, or possibly a different denominator such as the expected number of cases $E$ (which occurs when using indirect age-standardization). These models have the form -\begin{equation} -\begin{aligned} - &y \sim Poisson(e^\phi) \\ - &\phi \sim Normal(\mu, \Sigma) \\ - &\mu = log(p) + \alpha + x \beta. -\end{aligned} -\end{equation} - -Here you see that the linear predictor $\mu$ is embedded with the CAR model. - -Here is another way of writing down the very same model: -\begin{equation} -\begin{aligned} - &y \sim Poisson(e^\eta) \\ - &\eta = log(p) + \alpha + x \beta + \phi \\ - &\phi \sim Normal(0, \Sigma) \\ -\end{aligned} -\end{equation} - -It is possible to build a Stan modeling using either one of these two formulations. They are substantively equivalent, *but* you may find that one is far more amenable to Stan's MCMC algorithm. The former specification ($\mu$ inside the CAR model) is less commonly presented in papers, but in this author's experience it is often far more stable computationally than forcing the CAR model to have zero mean. (You may very well encounter a case where the opposite is true.) - -The purpose of the offset term is sometimes unclear in introductory texts. The offset term $p$ is from the denominator of the rates $\frac{y}{p}$. The expectation or mean of the model is -\begin{equation} -\begin{aligned} - &\mathbb{E}[y] = e^{log(p) + \mu} = e^{log(p)} \cdot e^{\mu} = p \cdot e^\mu \\ - &\mathbb{E}\Bigl[ \frac{y}{p} \Bigr] = e^\mu -\end{aligned} -\end{equation} -The smaller is the denominator, the less informative is the rate with respect to the characteristic level of risk bearing upon the population of the given period and place. With small denominators, chance renders the rates uninformative (and unstable over time and space), and the Poisson model accounts for this. - -The following Stan code provides a template for a simple disease mapping model using the CAR model as a prior for $\phi$. As before, you can save the code in your working directory and give it a name like "car_poisson.stan". I'll store the file name in my R environment: - -```{r} -car_poisson_file <- "car_poisson.stan" -``` - -```{stan output.var = "car_poisson"} -functions { -#include wcar-lpdf.stan -} - -data { - // data - int n; - int k; - array[n] int y; - matrix[n, k] x; - vector[n] const_offset; - - // CAR parts - int nAx_w; - int nC; - vector[nAx_w] Ax_w; - array[nAx_w] int Ax_v; - array[n + 1] int Ax_u; - array[nC] int Cidx; - vector[n] Delta_inv; - real log_det_Delta_inv; - vector[n] lambda; -} - -parameters { - // spatial autocorrelation (SA) parameter - real rho; - - // scale parameter - real tau; - - // intercept - real alpha; - - // coefficients - vector[k] beta; - - // SA trend component - vector[n] phi; -} - - -model { - vector[n] mu = const_offset + alpha + x * beta; - - // Likelihood: y ~ Poisson(e^[phi]) - target += poisson_lpmf(y | exp(phi)); - - // phi ~ Normal(Mu, Sigma) - target += wcar_normal_lpdf(phi | - mu, tau, rho, // mean, scale, SA - Ax_w, Ax_v, Ax_u, // stuff from prep_car_data - Delta_inv, - log_det_Delta_inv, - lambda, - n); - - // prior for scale parameter - target += student_t_lpdf(tau | 10, 0, 1); - - // prior for beta - target += normal_lpdf(beta | 0, 5); - - // prior for intercept - target += normal_lpdf(alpha | 0, 5); -} - -``` - -Again, you can use `geostan::prep_car_data` to easily convert the spatial weights matrix into the list of required inputs for the CAR model: - -```{r} -library(rstan) -library(geostan) -data(georgia) - -A <- shape2mat(georgia, "B") -car_list <- prep_car_data(A, style = "WCAR") - -# add data -car_list$y <- georgia$deaths.male -car_list$const_offset <- log(georgia$pop.at.risk.male) -car_list$x <- matrix(log(georgia$income / 10e3), ncol = 1) -car_list$k <- ncol(car_list$x) - -# compile Stan model from file -car_poisson <- stan_model(car_poisson_file) - -# sample from model -samples <- sampling(car_poisson, - data = car_list, - iter = 1e3) -``` - -The same results can be obtained using `geostan::stan_car`: - -```{r} -fit <- stan_car(deaths.male ~ offset(log(pop.at.risk.male)) + log(income / 10e3), - data = georgia, car = car_list, family = poisson(), iter = 1e3) -``` - -# SAR models - -The simultaneously-specified spatial autoregression (SAR) is written as -\begin{equation} - \begin{aligned} - y = \mu + (I - \rho \cdot W)^{-1} \epsilon \\ - \epsilon \sim Normal(0, \sigma^2 \cdot I) - \end{aligned} -\end{equation} -where $W$ is a row-standardized spatial weights matrix, $I$ is the n-by-n identity matrix, and $\rho$ is a spatial autocorrelation parameter. This is also a multivariate normal distribution but with a different covariance matrix than the CAR model: -\begin{equation} - \Sigma = \sigma^2 \cdot (I - \rho \cdot W)^{-1} (I - \rho \cdot W^T)^{-1}, -\end{equation} -where $T$ is the matrix transpose operator. The SA parameter $\rho$ for the SAR model has a more intuitive connection to the degree of SA than the CAR model (which is usually above .95 for moderately high SA). - -The Stan function that is used by `geostan::stan_sar` is as follows (the R code below will assume that you have saved this as "sar-lpdf.stan"): - -```{stan output.var = "sar_fun", eval = FALSE} -/** - * Log probability density of the simultaneous autoregressive (SAR) model (spatial error model) - * - * @param y Process to model - * @param mu Mean vector - * @param sigma Scale parameter - * @param rho Spatial dependence parameter - * @param ImW Sparse representation of (I - W): non-zero values only - * @param ImW_v Column indices for values in ImW - * @param ImW_u Row starting indices for values in ImW - * @param Widx Indices for the off-diagonal elements in ImC - * @param lambda Eigenvalues of W - * @param n Length of y - * - * @return Log probability density of SAR model up to additive constant -*/ - real sar_normal_lpdf(vector y, - vector mu, - real sigma, - real rho, - vector ImW, - array[] int ImW_v, - array[] int ImW_u, - array[] int Widx, - vector lambda, - int n) { - vector[n] z = y - mu; - real tau = 1 / sigma^2; - vector[num_elements(ImW)] ImrhoW = ImW; // (I - rho W) - vector[n] ImrhoWz; // (I - rho * W) * z - real zVz; - real ldet_V = 2 * sum(log1m(rho * lambda)) - 2 * n * log(sigma); - ImrhoW[Widx] = rho * ImW[Widx]; - ImrhoWz = csr_matrix_times_vector(n, n, ImrhoW, ImW_v , ImW_u , z); - zVz = tau * dot_self(ImrhoWz); - return 0.5 * (-n * log(2 * pi()) + ldet_V - zVz); - } - -``` - -The following Stan model provides an example of how to use this function to build a SAR model. Again, its an autonormal model for continuous outcome variable with $k$ covariates. - -```{r} -sar_model_file <- "sar_model.stan" -``` - -```{stan output.var = "auto_sar"} -functions { -#include sar-lpdf.stan -} - -data { - // data - int n; - int k; - vector[n] y; - matrix[n, k] x; - - -// SAR - int nImW_w; - int nW; - vector[nImW_w] ImW_w; - array[nImW_w] int ImW_v; - array[n + 1] int ImW_u; - array[nW] int Widx; - vector[n] eigenvalues_w; -} - -parameters { - // SA parameter - real rho; - - // scale parameter - real sigma; - - // intercept - real alpha; - - // coefficients - vector[k] beta; -} - -model{ - vector[n] mu = alpha + x * beta; - - // Likelihood: Y ~ Normal(Mu, Sigma) - target += sar_normal_lpdf(y | - mu, sigma, rho, - ImW_w, - ImW_v, - ImW_u, - Widx, - eigenvalues_w, - n); - - // prior for scale parameter - target += student_t_lpdf(sigma | 10, 0, 5); - - // prior for beta - target += normal_lpdf(beta | 0, 5); - - // prior for intercept - target += normal_lpdf(alpha | 0, 5); -} - -``` - - -```{r} -library(geostan) -library(rstan) -data(georgia) - -W <- shape2mat(georgia, "W") -sar_list <- prep_sar_data(W) - -# add data -sar_list$y <- log(georgia$income / 10e3) -sar_list$x <- matrix(log(georgia$population / 10e3), ncol = 1) -sar_list$k <- ncol(sar_list$x) - -# compile Stan model from file -sar_model <- stan_model(sar_model_file) - -# sample from model -samples <- sampling(sar_model, data = sar_list, iter = 1e3) -``` - -We can draw samples from the same model using `geostan::stan_sar`: - -```{r} -sar_dl <- prep_sar_data(W) -fit <- stan_sar(log(income / 10e3) ~ log(population / 10e3), data = georgia, - sar_parts = sar_dl, iter = 1e3) -``` - -One can also use the SAR model as a prior distribution for a parameter vector, just as was done above with the CAR model. - -# References