diff --git a/.gitignore b/.gitignore index a5c47354..43f75f04 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ cran-comments.md /doc/ /Meta/ CRAN-RELEASE +CRAN-SUBMISSION diff --git a/DESCRIPTION b/DESCRIPTION index 197ff1f6..834934ed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: geostan Title: Bayesian Spatial Analysis -Version: 0.5.4 +Version: 0.6.0 Date: 2024-03-01 URL: https://connordonegan.github.io/geostan/ BugReports: https://github.com/ConnorDonegan/geostan/issues diff --git a/NEWS.md b/NEWS.md index 3a13e1ae..bd168f6f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,16 @@ +# geostan 0.6.0 + +## New Additions + +The model fitting functions (`stan_glm`, `stan_car`, etc.) now allow for missing data in the outcome variable, and a new vignette provides the details. This functionality is not available for auto-Gaussian models - that is, CAR and SAR models fit to continuous outcome variables - but is available for all other models (including eigenvector spatial filtering `stan_esf` models for continuous outcomes, and all models for count outcomes [binomial and Poisson models]). + # geostan 0.5.4 Minor updates to the vignettes and documentation; also re-compiled geostan models using the latest StanHeaders (fixing an error on CRAN). # geostan 0.5.3 -### Minor changes +## Minor changes The `gamma` function (which is available to help set prior distributions) has been renamed to `geostan::gamma2` to avoid conflict with `base::gamma`. diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 5448c614..05d12278 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -1306,9 +1306,11 @@ exp_pars <- function(formula, data, C) { MCM <- M %*% C %*% M eigens <- eigen(MCM, symmetric = TRUE) npos <- sum(eigens$values > 0) - sa <- mc(residuals(lm(formula, data = data)), C) + res <- residuals(lm(formula, data = data)) + obs_idx <- which(!is.na(res)) + sa <- mc(res[obs_idx], C[obs_idx, obs_idx]) X <- model.matrix(formula, data) - E_sa <- expected_mc(X, C) + E_sa <- expected_mc(X, C[obs_idx, obs_idx]) Sigma_sa <- sqrt( 2 / nlinks ) z_sa <- (sa - E_sa) / Sigma_sa if (z_sa < -.59) { diff --git a/R/geostan_fit-methods.R b/R/geostan_fit-methods.R index 5bc972da..136947f5 100644 --- a/R/geostan_fit-methods.R +++ b/R/geostan_fit-methods.R @@ -32,7 +32,7 @@ #' @method print geostan_fit #' @rdname print_geostan_fit print.geostan_fit <- function(x, - probs = c(0.025, 0.25, 0.5, 0.75, 0.975), + probs = c(0.025, 0.2, 0.5, 0.8, 0.975), digits = 3, pars = NULL, ...) { pars <- c(pars, "intercept") @@ -61,7 +61,7 @@ print.geostan_fit, cat("Link function: ", x$family$link, "\n") cat("Residual Moran Coefficient: ", x$diagnostic["Residual_MC"], "\n") if (!is.na(x$diagnostic["WAIC"])) cat("WAIC: ", x$diagnostic["WAIC"], "\n") - cat("Observations: ", nrow(x$data), "\n") + cat("Observations: ", x$N, "\n") cat("Data models (ME): ") if (x$ME$has_me) { cat(paste(names(x$ME$se), sep = ", ")) diff --git a/R/internal-functions.R b/R/internal-functions.R index e7dfa8fb..7de40761 100644 --- a/R/internal-functions.R +++ b/R/internal-functions.R @@ -79,7 +79,6 @@ SLX <- function(f, DF, x, W) { #' @param y_mis_idx Index of censored counts in the outcome, if any.
make_data <- function(frame, x, y_mis_idx) { y <- model.response(frame) - if (length(y_mis_idx) > 0) y[y_mis_idx] <- NA offset <- model.offset(frame) if (is.null(offset)) return(cbind(y, x)) return(cbind(y, offset, x)) @@ -127,25 +126,37 @@ inv_logit <- function(x) exp(x)/(1 + exp(x)) #' #' @importFrom stats sd #' @noRd -make_priors <- function(user_priors = NULL, y, x, hs_global_scale, scaling_factor = 2, link = c("identity", "log", "logit"), EV, offset) { +make_priors <- function(user_priors = NULL, y, trials, x, hs_global_scale, scaling_factor = 2, link = c("identity", "log", "logit"), EV, offset) { link <- match.arg(link) if (link == "identity") { - scale.y <- sd(y) + y <- na.omit(y) + scale.y <- max(sd(y), .1) alpha_scale <- max(4 * sd(y), 5) alpha_mean <- mean(y) } if (link == "log") { if (any(y == 0)) y[which(y == 0)] <- 1 # local assignment only, not returned y <- log(y / exp(offset)) + if (any(is.infinite(y))) { + any_inf <- which(is.infinite(y)) + y[any_inf] <- NA + } + y <- na.omit(y) alpha_mean <- mean(y) - scale.y <- sd(y) + scale.y <- max(sd(y), .1, na.rm = T) alpha_scale <- max(4 * scale.y, 5) } - if (link == "logit") { - y <- y[,1] / (y[,1] + y[,2]) - alpha_mean <- 0 - scale.y <- sd(y) - alpha_scale <- 5 + if (link == "logit") { + p <- y / trials + y <- log( p / (1-p)) + if (any(is.infinite(y))) { + any_inf <- which(is.infinite(y)) + y[any_inf] <- NA + } + y <- na.omit(y) + alpha_mean <- mean(y) + scale.y <- max(sd(y), 0.1) + alpha_scale <- max(4 * scale.y, 5) } priors <- list() priors$intercept <- normal(location = alpha_mean, scale = alpha_scale) diff --git a/R/prep-censored-data.R b/R/prep-censored-data.R index ea0cb6d4..5143e48a 100644 --- a/R/prep-censored-data.R +++ b/R/prep-censored-data.R @@ -1,11 +1,6 @@ - - - - - #' @description Return index of observed, censored y; elsewhere, use results to replace NAs with zeros #' This will stop if there are missing values and the censor_point argument is not being used; outside of this call, must check that censor_point argument is only used with Poisson likelihood. #' @@ -14,10 +9,12 @@ #' @noRd #' handle_censored_y <- function(censor, frame) { - y_raw <- as.numeric(model.response(frame)) + y_raw <- model.response(frame) + if (inherits(y_raw, "matrix")) { + y_raw <- y_raw[,1] + y_raw[,2] + } y_mis_idx <- which(is.na(y_raw)) y_obs_idx <- which(!is.na(y_raw)) - if (censor == FALSE & length(y_mis_idx) > 0) stop("Missing values in y. If these are censored counts, you may want to use the censor_point argument.") return (list(n_mis = length(y_mis_idx), n_obs = length(y_obs_idx), y_mis_idx = y_mis_idx, @@ -29,5 +26,5 @@ handle_censored_y <- function(censor, frame) { #' @noRd handle_missing_x <- function(frame) { check_NAs <- apply(as.matrix(frame[,-1]), 2, function(c) any(is.na(c))) - if (any(check_NAs)) stop("Missing values found in covariates or offset term. Even when using the censor_point argument, missing values are only permitted in the outcome variable.") + if (any(check_NAs)) stop("Missing values found in covariates or offset term. 
Missing values are only permitted in the outcome variable.") } diff --git a/R/stan_car.R b/R/stan_car.R index 0d876bf3..90e3b775 100644 --- a/R/stan_car.R +++ b/R/stan_car.R @@ -250,7 +250,7 @@ stan_car <- function(formula, if (family$family == "gaussian" | family$family == "auto_gaussian") family <- auto_gaussian(type = "CAR") stopifnot(!missing(data)) check_car_parts(car_parts) - stopifnot(length(car_parts$Delta_inv) == nrow(data)) + stopifnot(car_parts$n == nrow(data)) if (!missing(C)) { stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) stopifnot(all(dim(C) == nrow(data))) @@ -267,13 +267,25 @@ stan_car <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness // y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - ## CAR: INCLUDE AUTO-GAUSSIAN AS family_int=5 ------------- - if (family_int %in% c(1,2,5)) y_int <- rep(0, length(y)) - ## ------------- - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "auto_gaussian")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + if ( length(y_index_list$y_mis_idx) > 0) stop("Missing values in y; missing values are not allowed for CAR/SAR models with continuous (non-count) outcomes. You may want to try stan_esf.") + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -339,7 +351,7 @@ stan_car <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, #n = n, # getting n from car_parts, below input_offset = offset, has_re = has_re, @@ -359,10 +371,11 @@ stan_car <- function(formula, ## PRIORS & CAR DATA ------------- is_student <- FALSE ##family$family == "student_t" priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, link = family$link, - offset = offset) + offset = offset[y_index_list$y_obs_idx]) if (is.null(prior$car_scale)) { priors_made$car_scale <- priors_made$sigma } else { @@ -390,11 +403,6 @@ stan_car <- function(formula, # append me.list to standata standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP, with CAR PARAMETERS [START] ------------- pars <- c(pars, 'intercept', 'car_scale', 'car_rho', 'fitted', 'log_lik') if (family_int < 5) pars <- c(pars, 'log_lambda_mu') @@ -451,6 +459,7 @@ stan_car <- function(formula, R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) } + out$N <- length( y_index_list$y_obs_idx ) return (out) } diff --git a/R/stan_esf.R b/R/stan_esf.R index b517b579..00f2a507 100644 --- a/R/stan_esf.R +++ b/R/stan_esf.R @@ -229,11 +229,24 @@ stan_esf <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness //
y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - if (family_int %in% c(1,2)) y_int <- rep(0, length(y)) - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "student_t")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -289,7 +302,7 @@ stan_esf <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, n = n, input_offset = offset, has_re = has_re, @@ -316,12 +329,13 @@ stan_esf <- function(formula, hs_global_scale <- min(1, p0/(dev-p0) / sqrt(n)) } priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, hs_global_scale = hs_global_scale, EV = EV, link = family$link, - offset = offset) + offset = offset[y_index_list$y_obs_idx]) standata <- append_priors(standata, priors_made) esf_dl <- list( dev = dev, @@ -337,11 +351,6 @@ stan_esf <- function(formula, ## ME MODEL STUFF ------------- me.list <- make_me_data(ME, xraw) standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP ------------- pars <- c(pars, 'intercept', 'esf', 'beta_ev', 'log_lik', 'fitted') if (!intercept_only) pars <- c(pars, 'beta') @@ -393,6 +402,7 @@ stan_esf <- function(formula, R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) } + out$N <- length( y_index_list$y_obs_idx ) return (out) } diff --git a/R/stan_glm.R b/R/stan_glm.R index 221cd8ba..6baffb3f 100644 --- a/R/stan_glm.R +++ b/R/stan_glm.R @@ -244,11 +244,24 @@ stan_glm <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness // y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - if (family_int %in% c(1,2)) y_int <- rep(0, length(y)) - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "student_t")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -314,7 +327,7 @@ stan_glm <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, n = n, input_offset = offset, has_re = has_re, @@ -331,24 +344,20 @@ stan_glm <- function(formula, ) ## ADD MISSING/OBSERVED INDICES ------------- standata <- c(y_index_list, standata) - ## PRIORS ------------- - 
is_student <- family$family == "student_t" + ## PRIORS ------------- + is_student <- family$family == "student_t" priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, link = family$link, - offset = offset) - standata <- append_priors(standata, priors_made) + offset = offset[y_index_list$y_obs_idx]) + standata <- append_priors(standata, priors_made) ## EMPTY PLACEHOLDERS standata <- c(standata, empty_icar_data(n), empty_esf_data(n), empty_car_data(), empty_sar_data(n)) ## ME MODEL ------------- me.list <- make_me_data(ME, xraw) standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP ------------- pars <- c(pars, 'intercept', 'fitted', 'log_lik') if (!intercept_only) pars <- c(pars, 'beta') @@ -391,7 +400,7 @@ stan_glm <- function(formula, out$re <- re_list out$priors <- priors_made_slim out$x_center <- get_x_center(standata, samples) - out$ME <- list(has_me = me.list$has_me, spatial_me = me.list$spatial_me) + out$ME <- list(has_me = me.list$has_me, spatial_me = me.list$spatial_me) if (out$ME$has_me) out$ME <- c(out$ME, ME) if (has_re) { out$spatial <- data.frame(par = "alpha_re", method = "Exchangeable") @@ -402,8 +411,8 @@ stan_glm <- function(formula, out$C <- as(C, "sparseMatrix") R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) - } + } + out$N <- length( y_index_list$y_obs_idx ) return(out) } - diff --git a/R/stan_icar.R b/R/stan_icar.R index c5a5ec14..52bf7503 100644 --- a/R/stan_icar.R +++ b/R/stan_icar.R @@ -286,11 +286,24 @@ stan_icar <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness // y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - if (family_int %in% c(1,2)) y_int <- rep(0, length(y)) - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "student_t")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -346,7 +359,7 @@ stan_icar <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, n = n, input_offset = offset, has_re = has_re, @@ -363,13 +376,14 @@ stan_icar <- function(formula, ) ## ADD MISSING/OBSERVED INDICES ------------- standata <- c(y_index_list, standata) - ## PRIORS ------------- + ## PRIORS ------------- is_student <- family$family == "student_t" priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, link = family$link, - offset = offset) + offset = offset[y_index_list$y_obs_idx]) standata <- append_priors(standata, priors_made) ## ICAR DATA [START] ------------- iar.list <- prep_icar_data(C, scale_factor = scale_factor) @@ -382,11 +396,6 
@@ stan_icar <- function(formula, ## ME MODEL ------------- me.list <- make_me_data(ME, xraw) standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP with ICAR [START] ------------- pars <- c(pars, 'intercept', 'spatial_scale', 'fitted', 'phi', 'log_lik') if (type == "bym2") pars <- c(pars, 'theta', 'rho') @@ -442,6 +451,7 @@ stan_icar <- function(formula, R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) } + out$N <- length( y_index_list$y_obs_idx ) return (out) } diff --git a/R/stan_sar.R b/R/stan_sar.R index 99a6fa6b..d4663bb8 100644 --- a/R/stan_sar.R +++ b/R/stan_sar.R @@ -247,6 +247,7 @@ stan_sar <- function(formula, if (family$family == "gaussian" | family$family == "auto_gaussian") family <- auto_gaussian(type = "SAR") stopifnot(!missing(data)) check_sar_parts(sar_parts) + stopifnot(sar_parts$n == nrow(data)) if (!missing(C)) { stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) stopifnot(all(dim(C) == nrow(data))) @@ -260,13 +261,25 @@ stan_sar <- function(formula, if (missing(censor_point)) censor_point <- FALSE mod_frame <- model.frame(formula, tmpdf, na.action = NULL) handle_missing_x(mod_frame) + ## prep Y for missingness // y_index_list <- handle_censored_y(censor_point, mod_frame) - y <- y_int <- model.response(mod_frame) - ## SAR: INCLUDE AUTO-GAUSSIAN AS family_int=6 ------------- - if (family_int %in% c(1,2,6)) y_int <- rep(0, length(y)) - ## ------------- - y[y_index_list$y_mis_idx] <- y_int[y_index_list$y_mis_idx] <- 0 - mod_frame[y_index_list$y_mis_idx, 1] <- 0 + mi_idx <- y_index_list$y_mis_idx + y_tmp <- model.response(mod_frame) + if (family$family == "binomial") { + trials <- y_tmp[,1] + y_tmp[,2] + y <- y_int <- y_tmp[,1] + } + if (family$family == "poisson") { + y <- y_int <- y_tmp + trials <- rep(0, n) + } + if (family$family %in% c("gaussian", "auto_gaussian")) { + y <- y_tmp + y_int <- trials <- rep(0, n) + if ( length(y_index_list$y_mis_idx) > 0) stop("Missing values in y; missing values are not allowed for CAR/SAR models with continuous (non-count) outcomes.
You may want to try stan_esf.") + } + y[mi_idx] <- y_int[mi_idx] <- trials[mi_idx] <- 0 + ## // if (is.null(model.offset(mod_frame))) { offset <- rep(0, times = n) } else { @@ -332,7 +345,7 @@ stan_sar <- function(formula, prior_only = prior_only, y = y, y_int = y_int, - trials = rep(0, length(y)), + trials = trials, #n = n, # getting n from sar_parts, below input_offset = offset, has_re = has_re, @@ -352,10 +365,11 @@ stan_sar <- function(formula, ## PRIORS & SAR DATA ------------- is_student <- FALSE ##family$family == "student_t" priors_made <- make_priors(user_priors = prior, - y = y, + y = y[y_index_list$y_obs_idx], + trials = trials[y_index_list$y_obs_idx], x = x_full, link = family$link, - offset = offset) + offset = offset[y_index_list$y_obs_idx]) if (is.null(prior$sar_scale)) { priors_made$sar_scale <- priors_made$sigma } else { @@ -375,11 +389,6 @@ stan_sar <- function(formula, ## ME MODEL ------------- me.list <- make_me_data(ME, xraw) standata <- c(standata, me.list) - ## INTEGER OUTCOMES ------------- - if (family$family == "binomial") { - standata$y <- standata$y_int <- y[,1] - standata$trials <- y[,1] + y[,2] - } ## PARAMETERS TO KEEP, with SAR PARAMETERS [START] ------------- pars <- c(pars, 'intercept', 'sar_scale', 'sar_rho', 'fitted', 'log_lik') if (family_int < 6) pars <- c(pars, 'log_lambda_mu') @@ -433,7 +442,8 @@ stan_sar <- function(formula, out$C <- as(C, "sparseMatrix") R <- resid(out, summary = FALSE) out$diagnostic["Residual_MC"] <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) - } + } + out$N <- length( y_index_list$y_obs_idx ) return (out) } diff --git a/README.Rmd b/README.Rmd index dab402ef..5fb011b7 100644 --- a/README.Rmd +++ b/README.Rmd @@ -20,7 +20,7 @@ knitr::opts_chunk$set( ## geostan: Bayesian spatial analysis -The [**geostan**](https://connordonegan.github.io/geostan/) R package supports a complete spatial analysis workflow with Bayesian models for areal data, including a suite of functions for visualizing spatial data and model results. For demonstrations and discussion, see the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html) on spatial autocorrelation, spatial measurement error models, and spatial regression with raster layers. +The [**geostan**](https://connordonegan.github.io/geostan/) R package supports a complete spatial analysis workflow with Bayesian models for areal data, including a suite of functions for visualizing spatial data and model results. For demonstrations and discussion, see the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html) on spatial autocorrelation, spatial measurement error models, spatial regression with raster layers, and building custom spatial models in Stan. The package is particularly suitable for public health research with spatial data, and complements the [**surveil**](https://connordonegan.github.io/surveil/) R package for time series analysis of public health surveillance data. diff --git a/README.html b/README.html deleted file mode 100644 index 1561109b..00000000 --- a/README.html +++ /dev/null @@ -1,702 +0,0 @@
[Deleted file contents elided: 702 lines of pkgdown-rendered HTML whose text duplicated README.md, which follows below.]
- - - diff --git a/README.md b/README.md index 2ab76a71..775418be 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,9 @@ and model results. For demonstrations and discussion, see the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html) -on spatial autocorrelation, spatial measurement error models, and -spatial regression with raster layers. +on spatial autocorrelation, spatial measurement error models, spatial +regression with raster layers, and building custom spatial model in +Stan. The package is particularly suitable for public health research with spatial data, and complements the @@ -86,6 +87,7 @@ Load the package and the `georgia` county mortality data set (ages ``` r library(geostan) +#> This is geostan version 0.5.3 data(georgia) ``` @@ -95,7 +97,7 @@ including a histogram, Moran scatter plot, and map: ``` r A <- shape2mat(georgia, style = "B") sp_diag(georgia$rate.female, georgia, w = A) -#> 3 NA values found in x; they will be dropped from the data before creating the Moran plot. If matrix w was row-standardized, it no longer is. You may want to use a binary connectivity matrix using style = 'B' in shape2mat. +#> 3 NA values found in x will be dropped from data x and matrix w #> Warning: Removed 3 rows containing non-finite values (`stat_bin()`). ``` @@ -133,6 +135,12 @@ fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)), #> Distribution: uniform #> lower upper #> 1 -1.7 1 +#> Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable. +#> Running the chains for more iterations may help. See +#> https://mc-stan.org/misc/warnings.html#bulk-ess +#> Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable. +#> Running the chains for more iterations may help. See +#> https://mc-stan.org/misc/warnings.html#tail-ess ``` Passing a fitted model to the `sp_diag` function will return a set of @@ -141,9 +149,8 @@ diagnostics for spatial models: ``` r sp_diag(fit, georgia, w = A) #> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE. -#> 3 NA values found in x; they will be dropped from the data before creating the Moran plot. If matrix w was row-standardized, it no longer is. You may want to use a binary connectivity matrix using style = 'B' in shape2mat. -#> Warning: Removed 3 rows containing missing values -#> (`geom_pointrange()`). +#> 3 NA values found in x will be dropped from data x and matrix w +#> Warning: Removed 3 rows containing missing values (`geom_pointrange()`). ``` @@ -161,8 +168,8 @@ print(fit) #> Spatial method (outcome): CAR #> Likelihood function: poisson #> Link function: log -#> Residual Moran Coefficient: 0.0021755 -#> WAIC: 1291.91 +#> Residual Moran Coefficient: -0.0004015 +#> WAIC: 1290.5 #> Observations: 159 #> Data models (ME): none #> Inference for Stan model: foundation. @@ -170,11 +177,11 @@ print(fit) #> post-warmup draws per chain=1000, total post-warmup draws=4000. 
#> #> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat -#> intercept -4.673 0.002 0.093 -4.843 -4.716 -4.674 -4.630 -4.497 1636 1.000 -#> car_rho 0.922 0.001 0.058 0.778 0.893 0.935 0.967 0.995 3214 1.001 -#> car_scale 0.458 0.001 0.035 0.393 0.433 0.455 0.481 0.532 3867 1.000 +#> intercept -4.620 0.055 0.418 -4.851 -4.719 -4.674 -4.627 -4.334 57 1.074 +#> car_rho 0.926 0.001 0.057 0.783 0.896 0.937 0.968 0.999 1494 1.002 +#> car_scale 0.457 0.001 0.035 0.394 0.433 0.455 0.479 0.529 3559 1.000 #> -#> Samples were drawn using NUTS(diag_e) at Wed Oct 4 12:20:45 2023. +#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 14:14:50 2024. #> For each parameter, n_eff is a crude measure of effective sample size, #> and Rhat is the potential scale reduction factor on split chains (at #> convergence, Rhat=1). diff --git a/cran-comments.md~ b/cran-comments.md~ deleted file mode 100644 index 88d6be0e..00000000 --- a/cran-comments.md~ +++ /dev/null @@ -1,3 +0,0 @@ - - -Package was archived on CRAN due to gcc-UBSAN issues which stem from StanHeaders. StanHeaders has since been updated to fix this issue. diff --git a/docs/404.html b/docs/404.html index 7d80cb4f..f40cf104 100644 --- a/docs/404.html +++ b/docs/404.html @@ -39,7 +39,7 @@ diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index dfd4bb61..99c4be3b 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ diff --git a/docs/LICENSE.html b/docs/LICENSE.html index 9298d5e7..665d0f87 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -17,7 +17,7 @@ @@ -50,6 +50,7 @@Version 3, 29 June 2007
Copyright © 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
“This License” refers to version 3 of the GNU General Public License.
“Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks.
“The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations.
@@ -79,7 +80,7 @@The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work.
A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language.
The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it.
@@ -88,23 +89,23 @@All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary.
No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures.
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work’s users, your or third parties’ legal rights to forbid circumvention of technological measures.
You may convey verbatim copies of the Program’s source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee.
You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions:
A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation’s users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate.
You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways:
“Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms:
@@ -158,24 +159,24 @@You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11).
However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice.
Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10.
You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so.
Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License.
An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party’s predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it.
A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor’s “contributor version”.
A contributor’s “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor’s essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version.
@@ -186,30 +187,30 @@If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program.
Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such.
The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation.
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy’s public statement of acceptance of a version permanently authorizes you to choose that version for the Program.
Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the “copyright” line and a pointer to where the full notice is found.
-<one line to give the program's name and a brief idea of what it does.>
-Copyright (C) <year> <name of author>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
<one line to give the program's name and a brief idea of what it does.>
+Copyright (C) <year> <name of author>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode:
-<program> Copyright (C) <year> <name of author>
-for details type 'show w'.
- This program comes with ABSOLUTELY NO WARRANTY;
- This is free software, and you are welcome to redistribute it'show c' for details. under certain conditions; type
<program> Copyright (C) <year> <name of author>
+This program comes with ABSOLUTELY NO WARRANTY; for details type 'show w'.
+This is free software, and you are welcome to redistribute it
+under certain conditions; type 'show c' for details.
The hypothetical commands show w
and show c
should show the appropriate parts of the General Public License. Of course, your program’s commands might be different; for a GUI interface, you would use an “about box”.
You should also get your employer (if you work as a programmer) or school, if any, to sign a “copyright disclaimer” for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/docs/articles/custom-spatial-models.html b/docs/articles/custom-spatial-models.html index 999e4e46..1825b657 100644 --- a/docs/articles/custom-spatial-models.html +++ b/docs/articles/custom-spatial-models.html @@ -40,7 +40,7 @@The spatial models in geostan use custom Stan functions that are far more efficient than using built-in functions, including the conditional (CAR) and simultaneous spatial autoregressive (SAR) models (both are particular specifications of the multivariate normal distribution). This vignette shows how you can use those functions together with some R functions in geostan to start building custom spatial models.
-This tutorial is written with the assumption that the reader is already familiar with RStan. Users are expected to make adjustments to the code as needed, including to prior distributions. For more details and an explanation of the computational approach, see Donegan (2022). For more background on spatial modeling see Haining and Li (2020).
+The spatial models in geostan use custom Stan functions that are far +more efficient than using built-in functions, including the conditional +(CAR) and simultaneous spatial autoregressive (SAR) models (both are +particular specifications of the multivariate normal distribution). This +vignette shows how you can use those functions together with some R +functions in geostan to start building custom spatial models.
+This tutorial is written with the assumption that the reader is +already familiar with RStan. Users are +expected to make adjustments to the code as needed, including to prior +distributions. For more details and an explanation of the computational +approach, see Donegan (2022). For more background on +spatial modeling see Haining and Li (2020).
The CAR model is a multivariate normal distribution with covariance matrix \[\Sigma = (I - \rho \cdot C)^{-1} M,\] where \(I\) is the identity matrix, \(\rho\) a spatial autocorrelation parameter which may be positive or negative, \(C\) is a sparse connectivity matrix, and \(M\) is a diagonal matrix with scale parameters \(\tau_i^2\). There is typically a single scale parameter \(\tau\) multiplied by weights \(\delta_i\). The term \(\tau^2 \cdot \delta_i\) is the conditional variance pertaining to \(y_i | y_{n(i)}\) where \(n(i)\) lists the areas that neighbor the \(i^{th}\) one.
-The defining feature of the CAR model is that, similar to time-series autoregression, the expectation for the value at the \(i^{th}\) site given any or all other values of \(y\) besides \(i\) (\(y_{-i}\)) are known, is a function of the neighboring values: \[\mathbb{E}[y_i | y_{-i}] = \mu_ + \sum_{j \in n(i)} \rho \cdot c_{ij} (y_j - \mu).\]
-The CAR function presented here is valid for what is sometimes called the WCAR specification. This is the most commonly employed CAR specification. The connectivity matrix is row-standardized. Let \(A\) be a symmetric binary connectivity matrix where \(a_{ij}= 1\) only if the \(i^{th}\) and \(j^{th}\) sites are neighbors, all other entries are zero (including the diagonal entries \(i=j\)). The elements of \(C\) are \[c_{ij} = \frac{a_{ij}}{\sum_j^n a_{ij}} = \frac{a_{ij}}{|n(i)|}.\]
-The diagonal of matrix \(M\) then is specified by the n-length vector of weights \(\delta_i = \tau^2 \cdot \frac{1}{|n(i)|}\).
-The geostan::prep_car_data
function will complete all of this for you. The user passes in a binary connectivity matrix \(A\) and and specifies style = "WCAR"
, then the function returns a list of data inputs for the Stan model.
The CAR model is a multivariate normal distribution with covariance +matrix \[\Sigma = (I - \rho \cdot C)^{-1} +M,\] where \(I\) is the identity +matrix, \(\rho\) a spatial +autocorrelation parameter which may be positive or negative, \(C\) is a sparse connectivity matrix, and +\(M\) is a diagonal matrix with scale +parameters \(\tau_i^2\). There is +typically a single scale parameter \(\tau\) multiplied by weights \(\delta_i\). The term \(\tau^2 \cdot \delta_i\) is the conditional +variance pertaining to \(y_i | +y_{n(i)}\) where \(n(i)\) lists +the areas that neighbor the \(i^{th}\) +one.
+The defining feature of the CAR model is that, similar to time-series +autoregression, the expectation for the value at the \(i^{th}\) site given any or all other values +of \(y\) besides \(i\) (\(y_{-i}\)) are known, is a function of the +neighboring values: \[\mathbb{E}[y_i | +y_{-i}] = \mu_ + \sum_{j \in n(i)} \rho \cdot c_{ij} (y_j - +\mu).\]
+The CAR function presented here is valid for what is sometimes called +the WCAR specification. This is the most commonly employed CAR +specification. The connectivity matrix is row-standardized. Let \(A\) be a symmetric binary connectivity +matrix where \(a_{ij}= 1\) only if the +\(i^{th}\) and \(j^{th}\) sites are neighbors, all other +entries are zero (including the diagonal entries \(i=j\)). The elements of \(C\) are \[c_{ij} += \frac{a_{ij}}{\sum_j^n a_{ij}} = \frac{a_{ij}}{|n(i)|}.\]
+The diagonal of matrix \(M\) then is +specified by the n-length vector of weights \(\delta_i = \tau^2 \cdot +\frac{1}{|n(i)|}\).
+The geostan::prep_car_data
function will complete all of
+this for you. The user passes in a binary connectivity matrix \(A\) and and specifies
+style = "WCAR"
, then the function returns a list of data
+inputs for the Stan model.
This first example is an autonormal model: \[y \sim Normal(\alpha + x \beta, \Sigma).\] Here you have \(n\) observations of a continuously measured outcome \(y\), possibly \(k=1\) or more covariates \(x\), and you’re accounting for spatial autocorrelation in \(y\) using the covariance matrix of an \(n\)-dimensional multivariate normal distribution.
-The following Stan code implements the above model; if you’re following along, copy the Stan code and save it in a file inside your working directory; I’ll name it “autonormal.stan” and I’ll save the name as an R variable:
+This first example is an autonormal model: \[y \sim Normal(\alpha + x \beta, \Sigma).\] +Here you have \(n\) observations of a +continuously measured outcome \(y\), +possibly \(k=1\) or more covariates +\(x\), and you’re accounting for +spatial autocorrelation in \(y\) using +the covariance matrix of an \(n\)-dimensional multivariate normal +distribution.
+The following Stan code implements the above model; if you’re +following along, copy the Stan code and save it in a file inside your +working directory; I’ll name it “autonormal.stan” and I’ll save the name +as an R variable:
autonormal_file <- "autonormal.stan"
Here’s the Stan code:
functions {
#include wcar-lpdf.stan
}

data {
  // data
  int<lower=1> n;
  int<lower=1> k;
  vector[n] y;
  matrix[n, k] x;

  // CAR parts
  int nAx_w;
  int nC;
  vector[nAx_w] Ax_w;
  array[nAx_w] int Ax_v;
  array[n + 1] int Ax_u;
  array[nC] int Cidx;
  vector[n] Delta_inv;
  real log_det_Delta_inv;
  vector[n] lambda;
}

parameters {
  // spatial autocorrelation (SA) parameter
  real<lower=1/min(lambda), upper=1/max(lambda)> rho;

  // scale parameter
  real<lower=0> tau;

  // intercept
  real alpha;

  // coefficients
  vector[k] beta;
}

model {
  vector[n] mu = alpha + x * beta;

  // Likelihood: y ~ Normal(Mu, Sigma)
  target += wcar_normal_lpdf(y |
                             mu, tau, rho,     // mean, scale, SA
                             Ax_w, Ax_v, Ax_u, // stuff from prep_car_data
                             Delta_inv,
                             log_det_Delta_inv,
                             lambda,
                             n);

  // prior for scale parameter
  target += student_t_lpdf(tau | 10, 0, 5);

  // prior for beta
  target += normal_lpdf(beta | 0, 5);

  // prior for intercept
  target += normal_lpdf(alpha | 0, 5);
}
The input file “wcar-lpdf.stan” contains the following code (save this code inside your working directory and name it “wcar-lpdf.stan”):
/**
 * Log probability density of the conditional autoregressive (CAR) model: WCAR specifications only
 *
 * @param y Process to model
 * @param mu Mean vector
 * @param tau Scale parameter
 * @param rho Spatial dependence parameter
 * @param A_w Sparse representation of the symmetric connectivity matrix, A
 * @param A_v Column indices for values in A_w
 * @param A_u Row starting indices for values in A_w
 * @param D_inv The row sums of A; i.e., the diagonal elements from the inverse of Delta, where M = Delta * tau^2 is a diagonal matrix containing the conditional variances.
 * @param log_det_D_inv Log determinant of Delta inverse.
 * @param lambda Eigenvalues of C (or of the symmetric, scaled matrix Delta^{-1/2}*C*Delta^{1/2}); for the WCAR specification, C is the row-standardized version of W.
 * @param n Length of y
 *
 * @return Log probability density of CAR prior up to additive constant
 */
real wcar_normal_lpdf(vector y, vector mu,
                      real tau, real rho,
                      vector A_w, array[] int A_v, array[] int A_u,
                      vector D_inv, real log_det_D_inv,
                      vector lambda,
                      int n) {
  vector[n] z = y - mu;
  real ztDz;
  real ztAz;
  vector[n] ldet_ImrhoC;
  ztDz = (z .* D_inv)' * z;
  ztAz = z' * csr_matrix_times_vector(n, n, A_w, A_v, A_u, z);
  for (i in 1:n) ldet_ImrhoC[i] = log1m(rho * lambda[i]);
  return 0.5 * (
    -n * log( 2 * pi() )
    -2 * n * log(tau)
    + log_det_D_inv
    + sum(ldet_ImrhoC)
    - (1 / tau^2) * (ztDz - rho * ztAz));
}
From R, you can use the following code to prepare the input data (\(y\), \(x\), etc.). I use prep_car_data to get a list of parts for the CAR model, then I append the outcome data to the same list and pass it all to a Stan model to draw samples.
library(rstan)
library(geostan)
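The data-preparation code was elided in this copy of the document. A minimal sketch consistent with the sampling call below, assuming the georgia data and the model formula used with stan_car further down:
data(georgia)

# CAR parts from the binary connectivity matrix
A <- shape2mat(georgia, style = "B")
car_list <- prep_car_data(A, style = "WCAR")

# append the outcome, covariate matrix, and dimensions
car_list$y <- log(georgia$income / 10e3)
car_list$x <- matrix(log(georgia$population / 10e3), ncol = 1)
car_list$n <- nrow(georgia)
car_list$k <- ncol(car_list$x)

# compile the Stan model
car_model <- stan_model(autonormal_file)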
# sample from model
samples <- sampling(car_model, data = car_list, iter = 1e3)
The same results can be obtained using geostan::stan_car:
fit <- stan_car(log(income / 10e3) ~ log(population / 10e3),
data = georgia, car = car_list, iter = 1e3)
Poisson models
Disease mapping is a common use for CAR models, though these models have many applications. In this class of models, the CAR model is assigned as the prior distribution to a parameter vector \(\phi\), which is used to model disease incidence rates across small areas like counties. The disease data consist of counts \(y\) together with the size of the population at risk \(p\), or possibly a different denominator such as the expected number of cases \(E\) (which occurs when using indirect age-standardization). These models have the form \[\begin{equation}
\begin{aligned}
&y \sim Poisson(e^\phi) \\
&\phi \sim Normal(\mu, \Sigma) \\
&\mu = \log(p) + \alpha + x \beta,
\end{aligned}
\end{equation}\] or, equivalently, with a zero-mean CAR prior: \[\begin{equation}
\begin{aligned}
&y \sim Poisson(e^{\mu + \phi}) \\
&\phi \sim Normal(0, \Sigma).
\end{aligned}
\end{equation}\]
It is possible to build a Stan model using either one of these two formulations. They are substantively equivalent, but you may find that one is far more amenable to Stan’s MCMC algorithm. The former specification (\(\mu\) inside the CAR model) is less commonly presented in papers, but in this author’s experience it is often far more stable computationally than forcing the CAR model to have zero mean. (You may very well encounter a case where the opposite is true.)
The purpose of the offset term is sometimes unclear in introductory texts. The offset term \(p\) is from the denominator of the rates \(\frac{y}{p}\). The expectation or mean of the model is \[\begin{equation}
\begin{aligned}
&\mathbb{E}[y] = e^{\log(p) + \mu} = e^{\log(p)} \cdot e^{\mu} = p \cdot e^\mu \\
&\mathbb{E}\Bigl[ \frac{y}{p} \Bigr] = e^\mu
\end{aligned}
\end{equation}\]
The smaller the denominator, the less informative the rate is with respect to the characteristic level of risk bearing upon the population of the given period and place. With small denominators, chance renders the rates uninformative (and unstable over time and space), and the Poisson model accounts for this.
The following Stan code provides a template for a simple disease mapping model using the CAR model as a prior for \(\phi\). As before, you can save the code in your working directory and give it a name like “car_poisson.stan”. I’ll store the file name in my R environment:
car_poisson_file <- "car_poisson.stan"
functions {
#include wcar-lpdf.stan
}

data {
  // data
  int<lower=1> n;
  int<lower=1> k;
  array[n] int<lower=0> y;
  matrix[n, k] x;
  vector[n] const_offset;

  // CAR parts
  int nAx_w;
  int nC;
  vector[nAx_w] Ax_w;
  array[nAx_w] int Ax_v;
  array[n + 1] int Ax_u;
  array[nC] int Cidx;
  vector[n] Delta_inv;
  real log_det_Delta_inv;
  vector[n] lambda;
}

parameters {
  // spatial autocorrelation (SA) parameter
  real<lower=1/min(lambda), upper=1/max(lambda)> rho;

  // scale parameter
  real<lower=0> tau;

  // intercept
  real alpha;

  // coefficients
  vector[k] beta;

  // SA trend component
  vector[n] phi;
}

model {
  vector[n] mu = const_offset + alpha + x * beta;

  // Likelihood: y ~ Poisson(e^[phi])
  target += poisson_lpmf(y | exp(phi));

  // phi ~ Normal(Mu, Sigma)
  target += wcar_normal_lpdf(phi |
                             mu, tau, rho,     // mean, scale, SA
                             Ax_w, Ax_v, Ax_u, // stuff from prep_car_data
                             Delta_inv,
                             log_det_Delta_inv,
                             lambda,
                             n);

  // prior for scale parameter
  target += student_t_lpdf(tau | 10, 0, 1);

  // prior for beta
  target += normal_lpdf(beta | 0, 5);

  // prior for intercept
  target += normal_lpdf(alpha | 0, 5);
}
Again, you can use geostan::prep_car_data to easily convert the spatial weights matrix into the list of required inputs for the CAR model:
library(rstan)
library(geostan)
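The data-preparation lines were elided in this copy. A sketch consistent with the sampling call below, assuming the georgia data and the formula used with stan_car afterwards:
data(georgia)

# CAR parts from the binary connectivity matrix
A <- shape2mat(georgia, style = "B")
car_list <- prep_car_data(A, style = "WCAR")

# append outcome counts, offset, covariate, and dimensions
car_list$y <- georgia$deaths.male
car_list$const_offset <- log(georgia$pop.at.risk.male)
car_list$x <- matrix(log(georgia$income / 10e3), ncol = 1)
car_list$n <- nrow(georgia)
car_list$k <- ncol(car_list$x)

# compile the Stan model
car_poisson <- stan_model(car_poisson_file)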
samples <- sampling(car_poisson,
data = car_list,
iter = 1e3)
The same results can be obtained using geostan::stan_car:
fit <- stan_car(deaths.male ~ offset(log(pop.at.risk.male)) + log(income / 10e3),
data = georgia, car = car_list, family = poisson(), iter = 1e3)
SAR models
The simultaneously-specified spatial autoregression (SAR) is written as \[\begin{equation}
\begin{aligned}
y = \mu + (I - \rho \cdot W)^{-1} \epsilon \\
\epsilon \sim Normal(0, \sigma^2 \cdot I)
\end{aligned}
\end{equation}\] where \(W\) is a row-standardized spatial weights matrix, \(I\) is the n-by-n identity matrix, and \(\rho\) is a spatial autocorrelation parameter. This is also a multivariate normal distribution, but with a different covariance matrix than the CAR model: \[\begin{equation}
\Sigma = \sigma^2 \cdot (I - \rho \cdot W)^{-1} (I - \rho \cdot W^T)^{-1},
\end{equation}\] where \(T\) is the matrix transpose operator. The SA parameter \(\rho\) for the SAR model has a more intuitive connection to the degree of SA than that of the CAR model (which is usually above .95 for moderately high SA).
The Stan function that is used by geostan::stan_sar is as follows (the R code below will assume that you have saved this as “sar-lpdf.stan”):
/**
 * Log probability density of the simultaneous autoregressive (SAR) model (spatial error model)
 *
 * @param y Process to model
 * @param mu Mean vector
 * @param sigma Scale parameter
 * @param rho Spatial dependence parameter
 * @param ImW Sparse representation of (I - W): non-zero values only
 * @param ImW_v Column indices for values in ImW
 * @param ImW_u Row starting indices for values in ImW
 * @param Widx Indices for the off-diagonal elements in ImW
 * @param lambda Eigenvalues of W
 * @param n Length of y
 *
 * @return Log probability density of SAR model up to additive constant
*/
real sar_normal_lpdf(vector y,
                     vector mu,
                     real sigma,
                     real rho,
                     vector ImW,
                     array[] int ImW_v,
                     array[] int ImW_u,
                     array[] int Widx,
                     vector lambda,
                     int n) {
  vector[n] z = y - mu;
  real tau = 1 / sigma^2;
  vector[num_elements(ImW)] ImrhoW = ImW; // (I - rho W)
  vector[n] ImrhoWz;                      // (I - rho * W) * z
  real zVz;
  real ldet_V = 2 * sum(log1m(rho * lambda)) - 2 * n * log(sigma);
  ImrhoW[Widx] = rho * ImW[Widx];
  ImrhoWz = csr_matrix_times_vector(n, n, ImrhoW, ImW_v, ImW_u, z);
  zVz = tau * dot_self(ImrhoWz);
  return 0.5 * (-n * log(2 * pi()) + ldet_V - zVz);
}
The following Stan model provides an example of how to use this function to build a SAR model. Again, it’s an autonormal model for a continuous outcome variable with \(k\) covariates.
sar_model_file <- "sar_model.stan"
functions {
#include sar-lpdf.stan
}

data {
  // data
  int<lower=1> n;
  int<lower=1> k;
  vector[n] y;
  matrix[n, k] x;

  // SAR
  int nImW_w;
  int nW;
  vector[nImW_w] ImW_w;
  array[nImW_w] int ImW_v;
  array[n + 1] int ImW_u;
  array[nW] int Widx;
  vector[n] eigenvalues_w;
}

parameters {
  // SA parameter
  real<lower=1/min(eigenvalues_w), upper=1/max(eigenvalues_w)> rho;

  // scale parameter
  real<lower=0> sigma;

  // intercept
  real alpha;

  // coefficients
  vector[k] beta;
}

model {
  vector[n] mu = alpha + x * beta;

  // Likelihood: Y ~ Normal(Mu, Sigma)
  target += sar_normal_lpdf(y |
                            mu, sigma, rho,
                            ImW_w,
                            ImW_v,
                            ImW_u,
                            Widx,
                            eigenvalues_w,
                            n);

  // prior for scale parameter
  target += student_t_lpdf(sigma | 10, 0, 5);

  // prior for beta
  target += normal_lpdf(beta | 0, 5);

  // prior for intercept
  target += normal_lpdf(alpha | 0, 5);
}
library(geostan)
library(rstan)
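The data-preparation code was elided in this copy; a minimal sketch consistent with the sampling call below, assuming the same georgia data and the formula used with stan_sar afterwards:
data(georgia)

# SAR parts from a row-standardized spatial weights matrix
W <- shape2mat(georgia, style = "W")
sar_list <- prep_sar_data(W)

# append outcome, covariate matrix, and dimensions
sar_list$y <- log(georgia$income / 10e3)
sar_list$x <- matrix(log(georgia$population / 10e3), ncol = 1)
sar_list$n <- nrow(georgia)
sar_list$k <- ncol(sar_list$x)

# compile the Stan model
sar_model <- stan_model(sar_model_file)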
# sample from model
samples <- sampling(sar_model, data = sar_list, iter = 1e3)
We can draw samples from the same model using geostan::stan_sar:
sar_dl <- prep_sar_data(W)
fit <- stan_sar(log(income / 10e3) ~ log(population / 10e3), data = georgia,
        sar_parts = sar_dl, iter = 1e3)
One can also use the SAR model as a prior distribution for a parameter vector, just as was done above with the CAR model.
Donegan, Connor. 2022. “Building Spatial Conditional Autoregressive Models in the Stan Programming Language.” OSF Preprints. https://doi.org/10.31219/osf.io/3ey65.
Haining, Robert P., and Guangquan Li. 2020. Modelling Spatial and Spatio-Temporal Data: A Bayesian Approach. CRC Press.
This vignette walks through exploratory spatial data analysis (ESDA) functionality in the geostan package, which includes methods for measuring and visualizing global and local spatial autocorrelation (SA). It includes a short review of the philosophy of ESDA and ends with a set of diagnostic plots for spatial models.
From the R console, load geostan and the georgia data set.
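In code, that is simply:
library(geostan)
data(georgia)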
georgia is a simple features (sf) object with estimates of county population characteristics from the American Community Survey (ACS) for the five-year period spanning 2014-2018. Their corresponding standard errors are also included. The column college contains ACS estimates for the percent of the population age 25 and older that has obtained a college degree or higher; the standard errors of the survey estimates are in the column named college.se.
Good (1983) contrasts exploratory data analysis (EDA) with confirmatory analysis as follows:
Confirmatory statistics consists of experimental design, significance testing, estimation, and prediction, whereas EDA is concerned mainly with the encouragement of hypothesis formulation. (284)
EDA techniques include all sorts of visual methods (histograms, scatter plots, etc.) that take a set of data, extract a limited summary of it (suppressing most other information), and then return a visual depiction that brings certain dimensions of (non-)variation to the foreground. As Good (1983) continues, “techniques of descriptive statistics are designed to match the salient features of the data set to human cognitive abilities” (287). EDA is also iterative: given that some model has been built (or a hypothesis is entertained), one starts the process again by examining residuals and other features of the model. EDA thus suggests a process for model criticism and improvement by way of successive approximation (see Gabry et al. 2019).
Exploratory spatial data analysis (ESDA) (Koschinsky 2020) includes all of this but also involves techniques that are of particular importance for spatially referenced data. ESDA involves tools for visualizing (residual) spatial patterns, decomposing spatial patterns into different elements across the map, and measuring the extent of spatial patterning in the data. Helpful ESDA tools draw out different spatial patterns from the data and communicate them in ways that are comprehensible to the human mind. In some cases, ESDA can help to contextualize data points. If ESDA draws attention to certain data points—for example, select observations that are very different from surrounding values—then locating them geographically may help to understand them as specific cases, as opposed to quantitative abstractions.
Tukey’s Exploratory Data Analysis (Tukey 1993) also offers principles for EDA and favors an approach to EDA that generally eschews hypothesis testing.
If we pass any variable x and its corresponding spatial object (e.g., simple features) to the sp_diag function, it returns a histogram, Moran scatter plot, and map of the estimates:
sp_diag(georgia$college, georgia, name = "College (%)")
The Moran plot is a visualization of the degree of SA: on the horizontal axis are the college estimates, while the vertical axis represents the mean neighboring value. The Moran coefficient, an index of SA, is printed at the top (MC = 0.42). The expected value of the MC under no SA is \(-1/(n-1)\) (Chun and Griffith 2013). The map shows that the contrast between the greater Atlanta metropolitan area and the rural counties is a prominent, if not the predominant, spatial pattern.
To summarize spatial information for the purpose of data analysis, we use an \(N \times N\) matrix, which we will call \(W\). Given some focal point/area on the map, the term “neighbor” will refer to any other area that is considered geographically ‘near’. The meaning is context specific, but with areal data the most common choice is to say that all areas whose borders touch each other are neighbors. Sometimes a distance threshold is used.
The value stored in the element \(w_{ij}\) expresses the degree of connectivity between the \(i^{th}\) and \(j^{th}\) areas. The diagonal elements of matrix \(W\) are all zero (no unit is its own neighbor), and all pairs of observations that are not neighbors also have weights of zero. There are a variety of ways to assign weights to neighbors. The binary coding scheme assigns neighbors a weight of one. In the row-standardized scheme, all neighbors of the \(i^{th}\) area have a weight of \(w_{ij} = \frac{1}{\delta_i}\), where \(\delta_i\) is the number of areas that neighbor the \(i^{th}\) area. Weights can also be assigned by taking the inverse of the distance between neighboring areas.
The shape2mat function takes a spatial object (simple features or spatial polygons) and creates a sparse matrix representation of the neighborhood structure.1 For a row-standardized coding scheme, we use style = "W".
W <- shape2mat(georgia, style = "W")
This function uses the spdep::poly2nb function to create the neighborhood structure.
We can create a Moran scatter plot using the moran_plot function. To reproduce the Moran plot given by sp_diag, we need to use the row-standardized spatial weights matrix:
moran_plot(georgia$college, W)
Similarly, we can calculate the Moran coefficient using mc:
mc(georgia$college, W)
#> [1] 0.422
Under particular conditions (the variable has been centered by subtracting its own mean from each value, and a row-standardized weights matrix is used), the Moran coefficient is equivalent to the slope of the regression line on the Moran plot.
If we use a binary spatial weights matrix (style = "B"), the vertical axis will show the sum of surrounding values:
A <- shape2mat(georgia, "B")
moran_plot(georgia$college, A)
When a binary matrix is used, counties with more neighbors contribute more to the MC than counties with fewer neighbors.
The quadrants of the Moran plot are helpful for classifying observations. The first (top right) quadrant represents counties with above-average values that are also surrounded by above-average values; the third (bottom left) quadrant contains low values surrounded by low values. Points in the first and third quadrants represent positive SA. The second (top left) and fourth (bottom right) quadrants represent negative SA: they contain spatial outliers, which are dissimilar from their neighbors.
The formula for the Moran coefficient (MC), an index of SA, is \[MC = \frac{n}{K}\frac{\sum_i^n \sum_j^n w_{ij} (y_i - \overline{y})(y_j - \overline{y})}{\sum_i^n (y_i - \overline{y})^2},\] where \(K\) is the sum of all values in the spatial connectivity matrix \(W\) (i.e., the sum of all row-sums: \(K = \sum_i \sum_j w_{ij}\)).
The MC is related to the Pearson product-moment correlation coefficient (Chun and Griffith 2013, 10). Whereas the correlation coefficient ranges from -1 to 1, the MC does not, except under special circumstances—the range usually expands somewhat. Its expected value under a condition of zero SA is not zero but \(-1/(n-1)\) (which approaches zero as \(n\) increases).
It is also important to understand that the MC does not ‘behave’ like the correlation coefficient. Specifically, for moderate to high levels of autocorrelation, the MC may not be highly sensitive to changes in the degree of autocorrelation; it is more powerful for detecting the presence of autocorrelation. This issue motivated development of an approximate estimator of the spatial autocorrelation parameter from the simultaneous autoregressive (SAR) model, called APLE (approximate-profile likelihood estimator) (Li, Calder, and Cressie 2007). While it is limited to smaller data sets, it is another helpful tool (see ?geostan::aple).
The Geary ratio (GR)
The formula for the Geary ratio (GR, or Geary’s C), another index of SA, is \[GR = \frac{(n-1)}{2K} \frac{\sum_i^n \sum_j^n w_{ij} (y_i - y_j)^2 }{\sum_i^n (y_i - \overline{y})^2}.\] The weighted sum in the numerator of the GR contains the squared differences between each observation and all of its respective neighbors. This means that local outliers and negative SA will cause the numerator to increase. By contrast, positive SA causes the numerator to become smaller. The denominator in the GR is the total sum of squared deviations from the mean of \(y\). This means that if there is no SA at all, the GR has an expected value of 1. As the degree of positive SA increases, the GR decreases towards zero; negative SA causes the GR to increase beyond 1. (\(1 - GR\) is perhaps a more intuitive value (Unwin 1996).)
The GR and the MC complement each other mathematically (Chun and Griffith 2013, 12): the GR can be re-written as \[GR = \mathcal{G} - \frac{n-1}{n}MC,\] where \[\mathcal{G}=\frac{n-1}{2K} \frac{\sum_i^n (y_i - \overline{y})^2 \big(\sum_j^n w_{ij}\big)}{\sum_i^n (y_i - \overline{y})^2}.\] The GR is more sensitive than the MC to negative SA and, as such, it is sensitive to local outliers. When a variable does not contain influential local outliers, the sum of its GR and MC values will be approximately equal to 1.
The following code calculates the MC and GR for the Georgia county estimates of percent college educated. Both of them indicate a moderate degree of positive SA:
x <- georgia$college
W <- shape2mat(georgia, "W")
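The calls themselves were elided in this copy; presumably they are simply the mc and gr functions applied to the variable and its weights matrix:
mc(x, W)
gr(x, W)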
Local indicators of spatial association (LISAs)
Local indicators of spatial association (LISA) were introduced by Anselin (1995) for ESDA. The LISA is a class of local SA statistics. Every LISA is related to a global SA index, such as the MC and GR. geostan contains functions for calculating local Geary’s C and a local version of the MC, known as local Moran’s I.
The idea of a local statistic is to index the amount and type of SA present at every discrete location on the map (e.g., for every county). The sum of all these LISA values is proportionate to its corresponding global SA index.
Local Moran’s I for the \(i^{th}\) unit is \[I_i = z_i \sum_j^n w_{ij} z_j,\] where \(z\) is the original variable in deviation form (\(z_i = y_i - \overline{y}\)) and \(W\) is the spatial connectivity matrix. These values are related to the Moran scatter plot: positive \(I_i\) values indicate positive SA, corresponding to quadrants I and III of the Moran scatter plot. Negative \(I_i\) values indicate negative SA, which appear in quadrants II and IV of the Moran scatter plot.
The local Geary statistic \(C_i\) is the weighted sum of squared differences found in the numerator of the GR: \[C_i = \sum_j^n w_{ij} (y_i - y_j)^2.\]
The local Geary statistic is very sensitive to local outliers (negative SA), more so than local Moran’s I. This makes these two LISAs complementary: \(I_i\) is most useful for visualizing clustering behavior, whereas \(C_i\) is useful for visualizing local outliers. By default, geostan will first scale \(y\) using z <- scale(y, center = TRUE, scale = TRUE) before calculating either of these LISAs.
The lisa function returns local Moran’s I values and indicates in which quadrant of the Moran plot the point is found:
W <- shape2mat(georgia, "W")
x <- log(georgia$income)
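The call that produced the output below was elided in this copy; presumably it was simply:
Li <- lisa(x, W)
head(Li)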
#> 4 1.473 HH
#> 5 -0.688 HL
#> 6 3.282 HH
“HH” indicates a high value surrounded by high values; “LL” is a low value surrounded by low values, and so on.
The local Geary statistic can be calculated using the lg function:
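The exact lines were elided here; a sketch, where attaching the result to georgia as Ci is an assumption based on the map code below:
georgia$Ci <- lg(x, W)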
Maps of the two local statistics highlight different qualities: \(I_i\) draws attention to clustering, whereas \(C_i\) draws attention to discontinuities and outliers:
Ci_map <- ggplot(georgia) +
geom_sf(aes(fill=Ci)) +
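  # (remaining plotting lines were elided in this copy; a sketch, assuming
  #  ggplot2 is loaded -- the scale and theme choices here are hypothetical:)
  scale_fill_gradient2() +
  theme_void()

# hypothetical step: attach the local Moran's I values to georgia for mapping
georgia$Li <- Li$Li
Li_map <- ggplot(georgia) +
  geom_sf(aes(fill = Li)) +
  scale_fill_gradient2() +
  theme_void()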
gridExtra::grid.arrange(Ci_map, Li_map, nrow = 1)
Due to their complementary strengths, these two indices are best used together.
Effective sample size
We can also consider what these spatial patterns mean in terms of the information content of our data; that is, the impact that SA might have on the amount of evidence that can be garnered from this data in an analysis. This is often described as effective sample size (ESS).
The n_eff function provides an approximate measure of ESS for spatially autocorrelated data (Griffith 2005). It is based on the simultaneous autoregressive (SAR) model. The function requires a value of the SA parameter, \(\rho\), from the SAR model and the number of observations in our data set. We can get a rough measure of ESS for a variable using the following code:
x <- log(georgia$income)
rho <- aple(x, W)
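Two elided lines presumably computed the nominal sample size and the ESS estimate, along these lines (the n_eff argument names are an assumption):
n <- nrow(georgia)
ess <- n_eff(rho = rho, n = n)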
c(nominal_n = n, rho = rho, MC = mc(x, W), ESS = ess)
#> nominal_n rho MC ESS
#> 159.00000 0.70100 0.49800 23.34735
This tells us that, given the degree of SA in the income estimates (\(\rho = .70\)), our nominal sample size of 159 observations has the same information content as about 23 independent observations.
This should provide some idea as to why it is so perilous to use conventional (non-spatial) statistical methods with spatial data. The odds of observing a strong correlation between any arbitrary pair of spatially patterned variables can be far greater than conventional methods report.
Model diagnostics
The sp_diag function can also be used to evaluate spatial models. One of the purposes of the function is to identify spatial patterns in the model residuals, because SA violates a core assumption (independence) of conventional statistical models and because spatial patterns can provide valuable information that we should pay attention to.
To demonstrate, we first fit a Poisson model to the Georgia male mortality data. The following code fits a log-linear Poisson model, and it models mortality rates for each county separately (that’s provided by the “random effects” argument: re = ~ GEOID).
C <- shape2mat(georgia)
fit <- stan_glm(deaths.male ~ offset(log(pop.at.risk.male)),
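                # (the remaining arguments and the print call were elided in
                #  this copy; presumably along these lines, with C available
                #  for the spatial diagnostics:)
                re = ~ GEOID,
                family = poisson(),
                data = georgia,
                C = C)
print(fit)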
#> Spatial method (outcome): Exchangeable
#> Likelihood function: poisson
#> Link function: log
#> Residual Moran Coefficient: 0.024477
#> WAIC: 1319.94
#> Observations: 159
#> Data models (ME): none
#> Inference for Stan model: foundation.
#> 4 chains, each with iter=2000; warmup=1000; thin=1;
#> post-warmup draws per chain=1000, total post-warmup draws=4000.
#>
#>             mean se_mean    sd   2.5%    20%    50%    80%  97.5% n_eff Rhat
#> intercept -4.180   0.001 0.023 -4.225 -4.199 -4.180 -4.161 -4.136   289 1.01
#> alpha_tau  0.247   0.000 0.016  0.219  0.234  0.247  0.260  0.280  3160 1.00
#>
#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 14:01:21 2024.
#> For each parameter, n_eff is a crude measure of effective sample size,
#> and Rhat is the potential scale reduction factor on split chains (at
#> convergence, Rhat=1).
The printed summary of results shows that the posterior probability distribution for the intercept, which in this case represents the mean log-mortality rate, is centered on \(-4.180\), which is a mortality rate of \(e^{-4.180} = 153\) per 10,000. The 2.5% and 97.5% columns provide the bounds on the 95% credible interval (CI) for each parameter; the CI for the intercept is [-4.22, -4.14].2
Provide the fitted model, fit, and the spatial data, georgia, to the sp_diag function to see a set of spatial model diagnostics:
sp_diag(fit, georgia)
#> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.
The point-interval plot on the left shows the raw mortality rates (the raw outcome data) on the x-axis, the fitted values on the y-axis, and a ‘perfect fit’ (slope = 1, intercept = 0) line for reference. We can see that a number of the fitted values have posterior means that deviate from the observations, but this “shrinkage” towards the mean is not necessarily a problem. In fact, it is often desirable insofar as it indicates that these are counties for which our data provide very little evidence as to what the risk of death is (i.e., the population is very small). For discussions of information pooling and other topics as well, see McElreath (2016) and Haining and Li (2020).
The middle panel is a Moran scatter plot of the model residuals, and the map shows the mean residual for each county.3
In this case, there is a small amount of residual autocorrelation, and the map indicates that this derives from a slight north-south/metropolitan-rural trend. The trend in the residuals suggests that shrinking towards the mean mortality rate may be less than ideal in this case—and potentially biased (socially)—because it appears that county-level mortality risk may be somewhat higher in the southern half of the state than in the greater Atlanta metropolitan area.
We could extend the model to address this concern by using one of geostan’s spatial models (e.g., stan_car), by adding one or more (substantively meaningful) covariates, or potentially both.
Anselin, Luc. 1995. “Local Indicators of Spatial Association—LISA.” Geographical Analysis 27 (2): 93–115.
Chun, Yongwan, and Daniel A. Griffith. 2013. Spatial Statistics and Geostatistics: Theory and Applications for Geographic Information Science and Technology. Sage.
Gabry, Jonah, Daniel Simpson, Aki Vehtari, Michael Betancourt, and Andrew Gelman. 2019. “Visualization in Bayesian Workflow.” Journal of the Royal Statistical Society: Series A (Statistics in Society) 182 (2): 389–402.
Good, I. J. 1983. “The Philosophy of Exploratory Data Analysis.” Philosophy of Science 50 (2): 283–95.
Griffith, Daniel A. 2005. “Effective Geographic Sample Size in the Presence of Spatial Autocorrelation.” Annals of the Association of American Geographers 95 (4): 740–60.
Haining, Robert, and Guangquan Li. 2020. Modelling Spatial and Spatial-Temporal Data: A Bayesian Approach. CRC Press.
Koschinsky, Julia. 2020. “Discovering the Unexpected & Explicable: Scientific Reasoning and Research Design for Spatial Data Analysis.” In Central European Cartographic Conference and 68th German Cartography Congress—EuroCarto 2020, 21–25 September.
Li, Hongfei, Catherine A. Calder, and Noel Cressie. 2007. “Beyond Moran’s I: Testing for Spatial Dependence Based on the Spatial Autoregressive Model.” Geographical Analysis 39 (4): 357–75.
McElreath, Richard. 2016. Statistical Rethinking: A Bayesian Course with Examples in R and Stan. CRC Press.
Tukey, John W. 1993. “Exploratory Data Analysis: Past, Present, and Future.” Technical Report No. 302 (Series 2), Department of Statistics, Princeton University. Accessed 10 Nov 2023 from https://apps.dtic.mil/sti/pdfs/ADA266775.pdf.
Unwin, Antony. 1996. “Geary’s Contiguity Ratio.” The Economic and Social Review 27 (2): 145–59.
1. For the most part, users do not need to know anything about sparse matrix objects to work with them. Objects from the Matrix package can typically be treated like objects of class “matrix”. Sometimes, however, you may need to make an explicit call to the Matrix package to access its methods. For example, colSums(W) will produce an error, but Matrix::colSums(W) will work as expected.↩︎
2. Stan will print important warning messages when Markov chain Monte Carlo (MCMC) diagnostics indicate any cause for concern, such as “Bulk Effective Samples Size (ESS) is too low.” Looking at the printed results, we can see that we kept a total of 4,000 MCMC samples for inference. If we then look at the “n_eff” (i.e., ESS) column in the table of results, we see that the effective sample size is smaller than the nominal sample size of 4,000 (this is almost always the case, due to serial autocorrelation in the MCMC samples). To see diagnostics for all model parameters at once, you can use the following function calls: rstan::stan_ess(fit$stanfit), rstan::stan_mcse(fit$stanfit), and rstan::stan_rhat(fit$stanfit) (and see the corresponding help pages, e.g. ?rstan::stan_rhat).↩︎
3. The residuals have been taken at their marginal posterior means. However, there is more than one way to measure residual autocorrelation. For an alternative visualization that uses the entire posterior distribution of parameters and provides an estimate of the residual Moran coefficient that will match the printed model results above (MC = 0.024), try sp_diag(fit, georgia, mc_style = "hist"). For every MCMC sample from the joint distribution of parameters, a vector of residuals is calculated, and these are then used to calculate the Moran coefficient. Using mc_style = "hist" will show a histogram depicting all \(M\) samples of the residual Moran coefficient.↩︎
This vignette provides a tutorial for fitting spatial regression models to raster data using geostan. The term “raster” is used here to refer to any regularly spaced set of observations such that the data can be represented spatially by a rectangular grid. Remotely sensed imagery is a common form of raster data.
geostan can be used for spatial regression with fairly large raster data layers, although the functionality of these models will often be limited to the estimation of regression coefficients and spatial autocorrelation parameters. Limited experience thus far finds that geostan’s spatial autoregressive models can be fit to raster layers with two hundred thousand observations using a laptop computer and fewer than ten minutes of sampling time.
We will create a small raster data layer for the purpose of illustration.
row <- 40
col <- 30
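The grid construction itself can be done with the sf package; here is one way (a sketch: the polygon boundary, the seed, and the simulated z and y are illustrative choices, not necessarily the vignette's exact code):

library(geostan)
library(sf)
# rectangular boundary matching the grid dimensions
pg <- st_sfc(st_polygon(list(rbind(c(0, 0), c(col, 0), c(col, row), c(0, row), c(0, 0)))))
grid <- st_sf(geometry = st_make_grid(pg, n = c(col, row)))
W <- shape2mat(grid, style = "W", queen = FALSE)  # row-standardized, rook adjacency
set.seed(1)
grid$z <- rnorm(nrow(grid))                       # simulated covariate
grid$y <- 0.5 * grid$z + rnorm(nrow(grid))        # simulated outcome, illustration only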
@@ -114,18 +127,39 @@ Demonstration
plot(grid[,'z'])
The following R code will fit a spatial autoregressive model to these data:
fit <- stan_sar(y ~ z, data = grid, C = W)
The stan_sar function will take the spatial weights matrix W and pass it through a function called prep_sar_data, which will calculate the eigenvalues of the spatial weights matrix using base::eigen, as required for computational reasons. This step is prohibitive for large data sets (e.g., \(N = 100,000\)).
The following code would normally be used to fit a conditional autoregressive (CAR) model:
C <- shape2mat(grid, style = "B", queen = FALSE)
car_list <- prep_car_data(C, "WCAR")
fit <- stan_car(y ~ z, data = grid, car_parts = car_list)
Here, the prep_car_data function calculates the eigenvalues of the spatial weights matrix using base::eigen, which is not feasible for large N.
The prep_sar_data2 and prep_car_data2 functions are designed for raster layers. As input, they require the dimensions of the grid (number of rows and number of columns). The eigenvalues are produced very quickly using Equation 5 from Griffith (2000). The methods have certain restrictions. First, this is only applicable to raster layers—regularly spaced, rectangular grids of observations. Second, to define which observations are adjacent to one another, the “rook” criterion is used (spatially, only observations that share an edge are defined as neighbors to one another). Third, the spatial adjacency matrix will be row-standardized. This is standard (and required) for SAR models, and it corresponds to the “WCAR” specification of the CAR model (see Donegan 2022).
The following code will fit a SAR model to our grid data, and is suitable for much larger raster layers:
sar_list <- prep_sar_data2(row = row, col = col)
## Range of permissible rho values: -1 1
@@ -162,8 +196,8 @@
print(fit)
The user first creates the data list using prep_sar_data2 and then passes it to stan_sar using the sar_parts argument. Also, slim = TRUE is invoked to prevent the model from collecting N-length parameter vectors and quantities of interest (such as fitted values and log-likelihoods).
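Putting the pieces together, the model call takes this general form (a sketch; the sampler settings shown, iter = 500 and chains = 4, are assumptions consistent with the 250 post-warmup draws per chain noted below):

fit <- stan_sar(y ~ z, data = grid, sar_parts = sar_list,
                slim = TRUE, iter = 500, chains = 4)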
For large data sets and complex models, slim = TRUE can bring about computational improvements at the cost of losing some functionality (including the loss of convenience functions like sp_diag, me_diag, spatial, resid, and fitted). Many quantities of interest, such as fitted values and spatial trend terms, can still be calculated manually using the data and parameter estimates (intercept, coefficients, and spatial autocorrelation parameters).
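For example, fitted values (ignoring the spatial trend term) could be reconstructed from posterior means along these lines (a sketch; it assumes geostan's convention of naming each coefficient after its covariate, as in the printed results):

S <- as.matrix(fit, pars = c("intercept", "z"))  # posterior samples
alpha <- mean(S[, "intercept"])                  # posterior mean intercept
beta <- mean(S[, "z"])                           # posterior mean coefficient on z
mu <- alpha + beta * grid$z                      # linear predictor for each grid cell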
The favorable MCMC diagnostics for this model (sufficiently large effective sample sizes n_eff, and Rhat values very near to 1), based on just 250 post-warmup iterations per chain with four MCMC chains, provide some indication as to how computationally efficient these spatial autoregressive models can be.
Also, note that Stan usually samples more efficiently when variables have been mean-centered. Using the centerx = TRUE argument in stan_sar (or any other model-fitting function in geostan) can be very helpful in this respect. Also note that the SAR models in geostan are (generally) no less computationally efficient than the CAR models, and may even be slightly more efficient.
Donegan, Connor. 2022. “Building Spatial Conditional Autoregressive Models in the Stan Programming Language.” OSF Preprints. https://doi.org/10.31219/osf.io/3ey65.
Griffith, Daniel A. 2000. “Eigenfunction Properties and Approximations of Selected Incidence Matrices Employed in Spatial Analyses.” Linear Algebra and Its Applications 321 (1-3): 95–112.
This vignette introduces users to the spatial measurement error (ME) models implemented in the geostan package (Donegan, Chun, and Griffith 2021). Variations on this methodology have been examined previously by multiple authors (Bernadinelli et al. 1997; Kang, Liu, and Cressie 2009; Logan et al. 2020; Xia and Carlin 1998).
These models are designed for spatial models that use survey estimates as covariates. They are designed for use with American Community Survey (ACS) data and other large, government-backed surveys, although other data types may be appropriate as well. A premise of this methodology is that the survey includes a systematic spatial sampling design (i.e., the sampling procedure was stratified by the areal unit of interest, whether they be block groups, counties, or states).
In a (spatial) regression analysis, measurement/sampling error in covariates can lead to model parameter estimates that are overly confident and prone to bias. This can lead to under- or over-estimation of population risks and needs, mainly because the noisy survey-based covariate passes directly into predictive values. This may impact real communities and service providers (see Bazuin and Fraser 2013). Examining the standard errors or margins of error of small-area survey estimates is an important first step that can help one determine whether a measurement error model should be considered.
Spatial measurement error models allow us to incorporate knowledge of data quality and human geography into our analyses, so that the final model results may better reflect our state of knowledge than they otherwise would. These models account for sampling error, but cannot address other sources of bias and error that may be present in survey estimates.
The measurement error (ME) models implemented in geostan are hierarchical Bayesian models (HBMs) that incorporate two sources of information:
The model treats the true value \(\boldsymbol x\) as an unknown parameter or a latent variable. The goal is to obtain a probability distribution for \(\boldsymbol x\). That information can then be incorporated into any of geostan’s regression or disease mapping models.
To represent the general social and technical knowledge on which this model is premised, we write \(\mathcal{M}\). This background knowledge includes, for example, that the data came from a valid small-area spatial sampling design.
The sampling distribution (a likelihood statement) states the following: if the true median income value is \(x_i\), then the probability of obtaining survey estimate \(z_i\) is \[ p(z_i | x_i , \mathcal{M}) = Gauss(z_i | x_i, s_i) \] where \(s_i\) is the standard error of the estimate. This is saying, more directly, that the true value \(x_i\) is probably within two standard errors of the estimate.
Our background knowledge on \(x\) is the second component of the model. This is simply the knowledge that social variables tend to be spatially patterned. Specifically, extreme values are not implausible, although they tend to be clustered together.
This information is encoded into a prior probability distribution for the unknown values \(\boldsymbol x\), using the spatial conditional autoregressive (CAR) model: \[ p(\boldsymbol x | \mathcal{M}) = Gauss(\boldsymbol x | \mu \boldsymbol 1, \boldsymbol \Sigma) \] where the bold symbols are used to indicate an object is a column vector or matrix.
The equation above states that the unknown values \(\boldsymbol x\) are spatially autocorrelated, with a mean value of \(\mu\) and conditional autoregressive (CAR) covariance matrix \[\boldsymbol \Sigma = (I - \rho C)^{-1} M,\] where \(\rho\) is a spatial autocorrelation parameter, \(I\) is the \(n \times n\) identity matrix, and \(C\) is a spatial connectivity matrix. \(M\) is an \(n \times n\) diagonal matrix; its diagonal contains the row-sums of \(C\) multiplied by a scale parameter \(\tau\) (Donegan 2022, Table 1 WCAR specification).
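As a concrete illustration, \(\boldsymbol \Sigma\) can be computed directly from a connectivity matrix (a sketch with illustrative values for \(\rho\) and \(\tau\); geostan constructs these quantities internally, and under the WCAR specification the conditional variances are proportional to \(1/N_i\)):

library(geostan)
library(Matrix)
data(georgia)
A <- shape2mat(georgia, style = "B")   # binary adjacency matrix
n_i <- rowSums(A)                      # number of neighbors, N_i
C <- Diagonal(x = 1 / n_i) %*% A       # row-standardized connectivity matrix
rho <- 0.9; tau <- 0.5                 # illustrative parameter values, not estimates
M <- Diagonal(x = tau^2 / n_i)         # conditional variances proportional to 1/N_i
Sigma <- solve(Diagonal(nrow(C)) - rho * C) %*% M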
If \(\boldsymbol x\) is a rate variable—e.g., the poverty rate—it can be important to use a logit-transformation (Logan et al. 2020) as follows: \[ p(logit(\boldsymbol x) | \mathcal{M}) = Gauss(logit(\boldsymbol x) | \mu \boldsymbol 1, \boldsymbol \Sigma) \]
The CAR parameters \(\mu\), \(\rho\), and \(\tau\) all require prior probability distributions; geostan uses the following by default:
\[\begin{equation}
\begin{split}
\mu &\sim Gauss(0, 100) \\
\tau &\sim Student_t(10, 0, 40) \\
\rho &\sim Uniform(\text{lower_bound}, \text{upper_bound})
\end{split}
\end{equation}\]
The default prior for \(\rho\) is uniform across its entire support (determined by the extreme eigenvalues of \(C\)).
For a sense of how all of this information gets combined in the model, consider two hypothetical survey estimates. The first has a small standard error—it is a reliable data point—and it is also similar to its surrounding counties. In that case, the probability distribution for \(x_i\) is going to look similar to the raw estimate and its standard error. Now consider a second county, where the estimate has a large standard error—it is unreliable—and the estimate is also dissimilar from its neighbors. In this case, we ought to consider it quite probable that the estimate \(x_i\) is far from the truth. This means that the probability distribution for \(x_i\) that comes from our model may be quite different from the raw estimate. That shift is often referred to as “shrinkage,” although the shift may be in either direction (up or down).
If a non-spatial prior model for \(\boldsymbol x\) were used instead of the CAR model, then there would be a tendency for all survey estimates \(z_i\) that are far from the mean to be shrunk towards the ‘global mean’, instead of the local area mean. This can lead to blatant social biases in ME models when they are applied to spatial data (Donegan, Chun, and Griffith 2021). For that and other reasons, the spatial CAR model is often essential for spatial ME models, and it remains important to graphically evaluate results (see below).
When we add a spatial ME model to a spatial regression model, the model results will automatically average over the uncertain inferences we make about \(\boldsymbol x\). The inferential biases that follow from ignoring measurement/observational uncertainty will be addressed, meaning that the probability distributions for the parameters will reflect the information we have on data quality. It is important to understand that there may still be un-modeled forms of bias and error in the ACS estimates—due to response bias and other sources—beyond the amount that is attributed to sampling error.
Discussion
library(geostan)
data(georgia)
The line data(georgia) loads the georgia data set from the geostan package into your working environment. You can learn more about the data by entering ?georgia in the R console.
Users can add a spatial ME model to any geostan model. A list of data for the ME models can be prepared using the prep_me_data function. The function requires at least two inputs from the user:
se: A data frame with survey standard errors.
car_parts: A list of data created by the prep_car_data function.
It also has optional inputs:
prior: A list of prior distributions for the CAR model parameters.
logit: A vector of logical values (TRUE or FALSE) specifying if the variate should be logit transformed. Only use this for rates (proportions between 0 and 1).
The default prior distributions are designed to be fairly vague relative to ACS variables (including percentages from 0-100), assuming that magnitudes such as income have been log-transformed already.
The logit-transformation is often required for skewed rates, such as the poverty rate. The user does not need to apply this transformation themselves (i.e., before passing the data to the model); simply use the logit argument in prep_me_data. The logit transformation is also good to use for rates because it automatically enforces rates to remain within their zero-one bounds.
(There is a third optional argument, bounds, but it is rarely needed. It forces the values of the latent parameter vector to remain between the given bounds. For example, rates must be between 0 and 1, but this is enforced automatically by the logit transform (which should be used in such cases). If it is needed for some reason, the bounds argument will apply to all of the modeled covariates.)
As an example, we will be using estimates of the percent covered by health insurance in Georgia counties. We are going to convert this to a rate so that we can use the logit transform:
georgia$insurance <- georgia$insurance / 100
georgia$insurance.se <- georgia$insurance.se / 100
Now we need to gather the standard errors georgia$insurance.se into a data frame. The column name for the standard errors must match the name of the variable it refers to (“insurance”):
SE <- data.frame(insurance = georgia$insurance.se)
If we intended to model additional survey-based covariates, we would simply add their standard errors to this data frame as another column.
We create a binary spatial connectivity matrix, and then pass it to prep_car_data to prepare our list of data for the CAR model:
C <- shape2mat(georgia, "B")
cars <- prep_car_data(C)
## Range of permissible rho values: -1.661134 1
Now we use prep_me_data to combine these items. We are going to use the logit-transform, since we are working with rates:
ME_list <- prep_me_data(se = SE,
car_parts = cars,
@@ -177,14 +302,17 @@ Preparing the data
Spatial ME model
The following code chunk sets up a spatial model for male county mortality rates (ages 55-64) using the insurance variable as a covariate (without the spatial ME model):
fit <- stan_car(deaths.male ~ offset(log(pop.at.risk.male)) + insurance,
centerx = TRUE,
data = georgia,
family = poisson(),
car_parts = cars)
To add the spatial ME model to the above specification, we simply provide our ME_list to the ME argument:
fit <- stan_car(deaths.male ~ offset(log(pop.at.risk.male)) + insurance,
centerx = TRUE,
@@ -203,9 +331,9 @@ Spatial ME model## 1 10 0 3
## lower upper
## 1 -1.7 1
-## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
+## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
-## https://mc-stan.org/misc/warnings.html#bulk-ess
+## https://mc-stan.org/misc/warnings.html#tail-ess
print(fit)
## Spatial Model Results
@@ -213,8 +341,8 @@ Spatial ME model## Spatial method (outcome): CAR
## Likelihood function: poisson
## Link function: log
-## Residual Moran Coefficient: -0.002759231
-## WAIC: 1322.25
+## Residual Moran Coefficient: -0.00231
+## WAIC: 1321.29
## Observations: 159
## Data models (ME): insurance
## Data model (ME prior): CAR (auto Gaussian)
@@ -222,18 +350,30 @@ Spatial ME model## 4 chains, each with iter=650; warmup=325; thin=1;
## post-warmup draws per chain=325, total post-warmup draws=1300.
##
-## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
-## intercept -4.165 0.002 0.057 -4.279 -4.199 -4.166 -4.134 -4.040 982 1.000
-## insurance -2.705 0.021 0.775 -4.168 -3.207 -2.729 -2.185 -1.108 1301 1.001
-## car_rho 0.869 0.002 0.085 0.673 0.821 0.886 0.933 0.985 1281 0.999
-## car_scale 0.446 0.001 0.032 0.388 0.423 0.444 0.466 0.511 1030 0.999
+## mean se_mean sd 2.5% 20% 50% 80% 97.5% n_eff Rhat
+## intercept -4.165 0.002 0.060 -4.278 -4.208 -4.164 -4.128 -4.048 635 1.004
+## insurance -2.738 0.022 0.783 -4.256 -3.418 -2.716 -2.090 -1.204 1312 1.000
+## car_rho 0.867 0.003 0.090 0.642 0.799 0.885 0.944 0.988 1142 1.002
+## car_scale 0.447 0.001 0.034 0.387 0.420 0.446 0.473 0.517 1261 0.999
##
-## Samples were drawn using NUTS(diag_e) at Fri Mar 1 17:55:22 2024.
+## Samples were drawn using NUTS(diag_e) at Tue Mar 19 14:02:13 2024.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
The CAR models in geostan can be highly efficient in terms of MCMC sampling, but the required number of iterations will depend on many characteristics of the model and data. Often the default iter = 2000 is more than sufficient. To speed up sampling with multi-core processing, use cores = 4 (to sample four chains in parallel).
In the following section, methods for examining MCMC samples of the modeled covariate values \(\boldsymbol x\) will be illustrated. Note that the process of storing MCMC samples for \(\boldsymbol x\) can become computationally burdensome if you have multiple covariates and moderately large N. If you do not need the samples of \(\boldsymbol x\), you can use the drop or slim arguments, as in:
fit <- stan_car(deaths.male ~ offset(log(pop.at.risk.male)) + insurance,
centerx = TRUE,
@@ -243,26 +383,68 @@ Spatial ME model data = georgia,
car_parts = cars
)
Using slim = TRUE may be faster (particularly for larger data sets), but will prevent the collection of MCMC samples for quantities of interest such as fitted values, pointwise log-likelihood values (required for WAIC), and modeled covariates \(\boldsymbol x\). The drop argument to the model fitting functions (e.g., stan_car) provides more control over which parameters are saved.
Visual diagnostics
The me_diag function provides some useful diagnostics for understanding how the raw estimates compare with the modeled values.
First, it provides a point-interval scatter plot that compares the raw survey estimates (on the horizontal axis) to the modeled values (on the vertical axis). The (posterior) probability distributions for the modeled values are represented by their 95% credible intervals. The intervals provide a sense of the quality of the ACS data—if the credible intervals on the modeled value are large, this tells us that the data are not particularly reliable.
The me_diag function also provides more direct visual depictions of the difference between the raw survey estimates \(z_i\) and the modeled values \(x_i\): \[\delta_i = z_i - x_i.\] We want to look for any strong spatial pattern in these \(\delta_i\) values, because that would be an indication of a bias. However, the magnitude of the \(\delta_i\) value is important to consider—there may be a pattern, but if the amount of shrinkage is very small, that pattern may not matter. (The model returns \(M\) samples from the posterior distribution of each \(x_i\); indexing by MCMC sample, these are written \(x_i^m\), where \(m\) is an index, not an exponent. The reported \(\delta_i\) values are the MCMC means \(\delta_i = \frac{1}{M} \sum_m (z_i - x_i^m)\).)
Two figures are provided to evaluate patterns in \(\delta_i\): first, a Moran scatter plot and, second, a map.
me_diag(fit, 'insurance', georgia)
In this case, the results do not look too concerning insofar as there are no conspicuous patterns. However, a number of the credible intervals on the modeled values are large, which indicates low data reliability. The fact that some of the \(\delta_i\) values are substantial also points to low data quality for those estimates.
It can be important to understand that the ME models are not independent from the rest of the model. When the modeled covariate \(\boldsymbol x\) enters a regression relationship, that regression relationship can impact the probability distribution for \(\boldsymbol x\) itself. (This is a feature of all Bayesian ME models). To examine the ME model in isolation, you can set the prior_only argument to TRUE (see ?stan_glm or any of the spatial models).
Working with MCMC samples from ME models
geostan consists of pre-compiled Stan models, and users can always access the Markov chain Monte Carlo (MCMC) samples returned by Stan. When extracted as a matrix of samples (as below), each row of the matrix represents a draw from the joint probability distribution for all model parameters, and each column consists of samples from the marginal distribution of each parameter.
The ME models return samples for \(x_i\) as well as the model parameters \(\mu\) (“mu_x_true”), \(\rho\) (“car_rho_x_true”), and \(\tau\) (“sigma_x_true”). We can access these using as.matrix (also as.array and as.data.frame).
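For example (a sketch, assuming the fitted ME model fit from above):

mu.x <- as.matrix(fit, pars = "mu_x_true")  # samples of the CAR mean parameter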
@@ -271,16 +453,17 @@ Working with MCMC samples from
head(mu.x)
## parameters
## iterations mu_x_true[1]
-## [1,] 1.791290
-## [2,] 1.829802
-## [3,] 1.752293
-## [4,] 1.765069
-## [5,] 1.864278
-## [6,] 1.689700
+## [1,] 1.829905
+## [2,] 1.812319
+## [3,] 1.861512
+## [4,] 1.677904
+## [5,] 1.827599
+## [6,] 1.749553
mean(mu.x)
-## [1] 1.785871
+## [1] 1.788058
We can visualize these using plot or print a summary:
## Inference for Stan model: foundation.
@@ -288,28 +471,34 @@ Working with MCMC samples from
## post-warmup draws per chain=325, total post-warmup draws=1300.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
-## mu_x_true[1] 1.79 0 0.09 1.63 1.74 1.79 1.83 1.94 648 1
-## car_rho_x_true[1] 0.91 0 0.07 0.73 0.87 0.92 0.95 0.99 1138 1
-## sigma_x_true[1] 0.48 0 0.03 0.42 0.46 0.48 0.50 0.55 1393 1
+## mu_x_true[1] 1.79 0.01 0.10 1.62 1.74 1.78 1.83 1.97 271 1.01
+## car_rho_x_true[1] 0.91 0.00 0.07 0.74 0.87 0.92 0.96 1.00 871 1.00
+## sigma_x_true[1] 0.48 0.00 0.04 0.42 0.46 0.48 0.51 0.56 1134 1.00
##
-## Samples were drawn using NUTS(diag_e) at Fri Mar 1 17:55:22 2024.
+## Samples were drawn using NUTS(diag_e) at Tue Mar 19 14:02:13 2024.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
To extract samples from the joint probability distribution for \(\boldsymbol x\), use the generic parameter name “x_true”:
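A sketch of the extraction step, using that generic parameter name:

x <- as.matrix(fit, pars = "x_true")  # one column per county, one row per draw
dim(x)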
## [1] 1300 159
If we wanted to calculate the mean of each of these marginal distributions (one for every \(x_i\)), we could use apply with MARGIN = 2 to summarize by column:
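A sketch of that calculation:

x.mu <- apply(x, 2, mean)  # posterior mean of each x_i, by column
head(x.mu)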
## x_insurance[1] x_insurance[2] x_insurance[3] x_insurance[4] x_insurance[5]
-## 0.8366455 0.7997958 0.8517097 0.8521625 0.9179326
+## 0.8366154 0.7995531 0.8519096 0.8521234 0.9178645
## x_insurance[6]
-## 0.8702921
+## 0.8702474
The vector x.mu contains estimates (posterior means) for \(x_i\). We might want to use these to plot the residuals or fitted values against the predictor:
rs <- resid(fit)$mean
plot(x.mu, rs)
@@ -320,7 +509,10 @@ Working with MCMC samples from
Non-spatial ME models
If the ME list doesn’t have a slot with car_parts, geostan will automatically use a non-spatial Student’s t model instead of the CAR model: \[ p(\boldsymbol x | \mathcal{M}) = Student(\boldsymbol x | \nu, \mu \boldsymbol 1, \sigma) \]
ME_nsp <- prep_me_data(
se = data.frame(insurance = georgia$insurance.se),
@@ -331,7 +523,8 @@ Non-spatial ME models
Multiple covariates
To model multiple covariates, simply add them to the data frame of standard errors:
georgia$college <- georgia$college / 100
georgia$college.se <- georgia$college.se / 100
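For example, the data frame of standard errors might become (a sketch following the column-naming rule described earlier):

SE <- data.frame(insurance = georgia$insurance.se,
                 college = georgia$college.se)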
@@ -348,13 +541,21 @@ Multiple covariates= georgia,
iter = 700
)
Results can be examined one at a time using me_diag(fit, 'insurance', georgia) and me_diag(fit, 'college', georgia).
Log transforms
Income and other magnitudes are often log transformed. In that case, the survey standard errors also need to be transformed. The se_log function provides approximate standard errors for this purpose.
When using se_log, pass in the variable itself (untransformed) and its standard errors. Here is a full workflow for a spatial mortality model with covariate measurement error:
data(georgia)
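A sketch of the log-transform step (this assumes the georgia columns income and income.se; se_log is applied to the untransformed variable and its standard errors, as described above):

georgia$log_income <- log(georgia$income)
georgia$log_income.se <- se_log(georgia$income, georgia$income.se)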
@@ -411,27 +612,45 @@ Log transforms
References
Bazuin, Joshua Theodore, and James Curtis Fraser. 2013. “How the ACS Gets It Wrong: The Story of the American Community Survey and a Small, Inner City Neighborhood.” Applied Geography 45 (12): 292–302. https://doi.org/10.1016/j.apgeog.2013.08.013.
Bernadinelli, L, Cristian Pascutto, NG Best, and WR Gilks. 1997. “Disease Mapping with Errors in Covariates.” Statistics in Medicine 16 (7): 741–52.
Donegan, Connor. 2022. “Building Spatial Conditional Autoregressive Models in the Stan Programming Language.” OSF Preprints. https://doi.org/10.31219/osf.io/3ey65.
Donegan, Connor, Yongwan Chun, and Daniel A Griffith. 2021. “Modeling Community Health with Areal Data: Bayesian Inference with Survey Standard Errors and Spatial Structure.” International Journal of Environmental Research and Public Health 18 (13): 6856. https://doi.org/10.3390/ijerph18136856.
Kang, Emily L, Desheng Liu, and Noel Cressie. 2009. “Statistical Analysis of Small-Area Data Based on Independence, Spatial, Non-Hierarchical, and Hierarchical Models.” Computational Statistics & Data Analysis 53 (8): 3016–32.
Logan, John R, Cici Bauer, Jun Ke, Hongwei Xu, and Fan Li. 2020. “Models for Small Area Estimation for Census Tracts.” Geographical Analysis 52 (3): 325–50.
Xia, Hong, and Bradley P Carlin. 1998. “Spatio-Temporal Models with Errors in Covariates: Mapping Ohio Lung Cancer Mortality.” Statistics in Medicine 17 (18): 2025–43.
diff --git a/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-14-1.png b/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-14-1.png
index a4cc7538..060ef96a 100644
Binary files a/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-14-1.png and b/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-14-1.png differ
diff --git a/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-9-1.png b/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-9-1.png
index 3f6077f1..0738c779 100644
Binary files a/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-9-1.png and b/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-9-1.png differ
diff --git a/docs/authors.html b/docs/authors.html
index 527b4ce5..a2858abe 100644
--- a/docs/authors.html
+++ b/docs/authors.html
@@ -17,7 +17,7 @@
diff --git a/docs/index.html b/docs/index.html
index 3d4e3016..6d6a206b 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -40,7 +40,7 @@
@@ -80,7 +80,7 @@
geostan: Bayesian spatial analysis
-The geostan R package supports a complete spatial analysis workflow with Bayesian models for areal data, including a suite of functions for visualizing spatial data and model results. For demonstrations and discussion, see the package help pages and vignettes on spatial autocorrelation, spatial measurement error models, and spatial regression with raster layers.
+The geostan R package supports a complete spatial analysis workflow with Bayesian models for areal data, including a suite of functions for visualizing spatial data and model results. For demonstrations and discussion, see the package help pages and vignettes on spatial autocorrelation, spatial measurement error models, spatial regression with raster layers, and building custom spatial models in Stan.
The package is particularly suitable for public health research with spatial data, and complements the surveil R package for time series analysis of public health surveillance data.
geostan models were built using Stan, a state-of-the-art platform for Bayesian modeling.
@@ -134,12 +134,13 @@ Usage
Load the package and the georgia
county mortality data set (ages 55-64, years 2014-2018):
The sp_diag
function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map:
A <- shape2mat(georgia, style = "B")
sp_diag(georgia$rate.female, georgia, w = A)
-#> 3 NA values found in x; they will be dropped from the data before creating the Moran plot. If matrix w was row-standardized, it no longer is. You may want to use a binary connectivity matrix using style = 'B' in shape2mat.
+#> 3 NA values found in x will be dropped from data x and matrix w
#> Warning: Removed 3 rows containing non-finite values (`stat_bin()`).
There are three censored observations in the georgia
female mortality data, which means there were 9 or fewer deaths in those counties. The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. By using the censor_point
argument we include our information on the censored observations to obtain results for all counties:
@@ -167,14 +168,19 @@ Usage
#> *Setting prior for CAR spatial autocorrelation parameter (car_rho)
#> Distribution: uniform
#> lower upper
-#> 1 -1.7 1
+#> 1 -1.7 1
+#> Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
+#> Running the chains for more iterations may help. See
+#> https://mc-stan.org/misc/warnings.html#bulk-ess
+#> Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
+#> Running the chains for more iterations may help. See
+#> https://mc-stan.org/misc/warnings.html#tail-ess
Passing a fitted model to the sp_diag
function will return a set of diagnostics for spatial models:
sp_diag(fit, georgia, w = A)
#> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.
-#> 3 NA values found in x; they will be dropped from the data before creating the Moran plot. If matrix w was row-standardized, it no longer is. You may want to use a binary connectivity matrix using style = 'B' in shape2mat.
-#> Warning: Removed 3 rows containing missing values
-#> (`geom_pointrange()`).
+#> 3 NA values found in x will be dropped from data x and matrix w
+#> Warning: Removed 3 rows containing missing values (`geom_pointrange()`).
The print
method returns a summary of the probability distributions for model parameters, as well as Markov chain Monte Carlo (MCMC) diagnostics from Stan (Monte Carlo standard errors of the mean se_mean
, effective sample size n_eff
, and the R-hat statistic Rhat
):
@@ -184,8 +190,8 @@ Usage
#> Spatial method (outcome): CAR
#> Likelihood function: poisson
#> Link function: log
-#> Residual Moran Coefficient: 0.0021755
-#> WAIC: 1291.91
+#> Residual Moran Coefficient: -0.0004015
+#> WAIC: 1290.5
#> Observations: 159
#> Data models (ME): none
#> Inference for Stan model: foundation.
@@ -193,11 +199,11 @@ Usage
#> post-warmup draws per chain=1000, total post-warmup draws=4000.
#>
#> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
-#> intercept -4.673 0.002 0.093 -4.843 -4.716 -4.674 -4.630 -4.497 1636 1.000
-#> car_rho 0.922 0.001 0.058 0.778 0.893 0.935 0.967 0.995 3214 1.001
-#> car_scale 0.458 0.001 0.035 0.393 0.433 0.455 0.481 0.532 3867 1.000
+#> intercept -4.620 0.055 0.418 -4.851 -4.719 -4.674 -4.627 -4.334 57 1.074
+#> car_rho 0.926 0.001 0.057 0.783 0.896 0.937 0.968 0.999 1494 1.002
+#> car_scale 0.457 0.001 0.035 0.394 0.433 0.455 0.479 0.529 3559 1.000
#>
-#> Samples were drawn using NUTS(diag_e) at Wed Oct 4 12:20:45 2023.
+#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 14:14:50 2024.
#> For each parameter, n_eff is a crude measure of effective sample size,
#> and Rhat is the potential scale reduction factor on split chains (at
#> convergence, Rhat=1).
diff --git a/docs/news/index.html b/docs/news/index.html
index 69d08ecd..73b68718 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -17,7 +17,7 @@
@@ -51,13 +51,20 @@ Changelog
-geostan 0.5.4
+geostan 0.6.0
+
+New Additions
+The model fitting functions (stan_glm
, stan_car
, etc.) now allow for missing data in the outcome variable and a new vignette provides the details. This functionality is not available for auto-Gaussian models - that is, CAR and SAR models that have been fit to continuous outcome variables - but is available for all other available models (including eigenvector spatial filtering stan_esf
models for continuous outcomes, and all models for count outcomes [binomial and Poisson models]).
+
+
+
+geostan 0.5.42024-03-03
Minor updates to the vignettees and documentation, also re-compiled geostan models using the latest StanHeaders (fixing an error on CRAN).
geostan 0.5.32023-11-24
-
-Minor changes
+
+Minor changes
The gamma
function (which is available to help set prior distributions) has been renamed to geostan::gamma2
to avoid conflict with base::gamma
.
Some code for geostan::stan_car
was cleaned up to avoid sending duplicate variables to the Stan model when a spatial ME (measurement error) model was used: https://github.com/ConnorDonegan/geostan/issues/17. This should not change any functionality and there is no reason to suspect that results were ever impacted by the duplicate variables.
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index c521581e..aec39789 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -1,4 +1,4 @@
-pandoc: 2.9.2.1
+pandoc: 3.1.1
pkgdown: 2.0.7
pkgdown_sha: ~
articles:
@@ -6,7 +6,7 @@ articles:
measuring-sa: measuring-sa.html
raster-regression: raster-regression.html
spatial-me-models: spatial-me-models.html
-last_built: 2024-03-01T23:54Z
+last_built: 2024-03-19T19:00Z
urls:
reference: https://connordonegan.github.io/geostan/reference
article: https://connordonegan.github.io/geostan/articles
diff --git a/docs/reference/aple.html b/docs/reference/aple.html
index 4d738d0a..496ce5e8 100644
--- a/docs/reference/aple.html
+++ b/docs/reference/aple.html
@@ -17,7 +17,7 @@
diff --git a/docs/reference/auto_gaussian.html b/docs/reference/auto_gaussian.html
index ccbaf0e4..e4eff5a7 100644
--- a/docs/reference/auto_gaussian.html
+++ b/docs/reference/auto_gaussian.html
@@ -17,7 +17,7 @@
diff --git a/docs/reference/edges.html b/docs/reference/edges.html
index 0499c01d..d8d6277d 100644
--- a/docs/reference/edges.html
+++ b/docs/reference/edges.html
@@ -17,7 +17,7 @@
diff --git a/docs/reference/eigen_grid.html b/docs/reference/eigen_grid.html
index 86cc920e..87aa4e8c 100644
--- a/docs/reference/eigen_grid.html
+++ b/docs/reference/eigen_grid.html
@@ -17,7 +17,7 @@
diff --git a/docs/reference/expected_mc.html b/docs/reference/expected_mc.html
index 588eae8e..fb7126d0 100644
--- a/docs/reference/expected_mc.html
+++ b/docs/reference/expected_mc.html
@@ -17,7 +17,7 @@
diff --git a/docs/reference/figures/README-unnamed-chunk-5-1.png b/docs/reference/figures/README-unnamed-chunk-5-1.png
index 29d309fc..703ce6e8 100644
Binary files a/docs/reference/figures/README-unnamed-chunk-5-1.png and b/docs/reference/figures/README-unnamed-chunk-5-1.png differ
diff --git a/docs/reference/georgia.html b/docs/reference/georgia.html
index f8b5703f..6673ab97 100644
--- a/docs/reference/georgia.html
+++ b/docs/reference/georgia.html
@@ -17,7 +17,7 @@
diff --git a/docs/reference/geostan-package.html b/docs/reference/geostan-package.html
index 0be7fd49..85000f52 100644
--- a/docs/reference/geostan-package.html
+++ b/docs/reference/geostan-package.html
@@ -17,7 +17,7 @@
diff --git a/docs/reference/get_shp.html b/docs/reference/get_shp.html
index 1c981b5b..6e8a058e 100644
--- a/docs/reference/get_shp.html
+++ b/docs/reference/get_shp.html
@@ -17,7 +17,7 @@
diff --git a/docs/reference/gr.html b/docs/reference/gr.html
index b6af521b..7ade92e1 100644
--- a/docs/reference/gr.html
+++ b/docs/reference/gr.html
@@ -17,7 +17,7 @@
The CAR model is:
Normal(Mu, Sigma), Sigma = (I - rho * C)^-1 * M * tau^2,
where I is the identity matrix, rho is a spatial autocorrelation parameter, C is a connectivity matrix, and M * tau^2 is a diagonal matrix with conditional variances on the diagonal. tau^2 is a (scalar) scale parameter.
In the WCAR specification, C is the row-standardized version of A. This means that the non-zero elements of A will be converted to 1/N_i, where N_i is the number of neighbors for the ith site (obtained using Matrix::rowSums(A)). The conditional variances (on the diagonal of M * tau^2) are also proportional to 1/N_i.
The ACAR specification is from Cressie, Perrin and Thomas-Agnon (2005); also see Cressie and Wikle (2011, p. 188) and Donegan (2021).
diff --git a/docs/reference/prep_car_data2.html b/docs/reference/prep_car_data2.html
index 79a7d0df..96142539 100644
--- a/docs/reference/prep_car_data2.html
+++ b/docs/reference/prep_car_data2.html
@@ -17,7 +17,7 @@
beta_j ~ Normal(0, tau^2 * lambda_j^2)
The scale parameter for this prior is the product of two terms: lambda_j^2 is specific to the variable beta_j, and tau^2 is known as the global shrinkage parameter.
The global shrinkage parameter is assigned a half-Cauchy prior:
tau ~ Cauchy(0, global_scale * sigma)
where global_scale is provided by the user and sigma is the scale parameter for the outcome variable; for Poisson and binomial models, sigma is fixed at one. Use global_scale to control the overall sparsity of the model.
The second part of the model is a Student's t prior for lambda_j. Most lambda_j will be small, since the model is half-Cauchy:
lambda_j ~ Cauchy(0, 1)
This model results in most lambda_j being small, but due to the long tails of the Cauchy distribution, strong evidence in the data can force any particular lambda_j to be large. Piironen and Vehtari (2017) adjust the model so that those large lambda_j are effectively assigned a Student's t model:
Big_lambda_j ~ Student_t(slab_df, 0, slab_scale)
This is a schematic representation of the model; see Piironen and Vehtari (2017) or Donegan et al. (2020) for details.
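As a quick illustration of the shrinkage behavior described above, here is a small base-R simulation of draws from the (unregularized) horseshoe prior; the global_scale and sigma values are arbitrary examples:
set.seed(1)
J <- 1000
global_scale <- 0.1
sigma <- 1                                        # fixed at one for Poisson/binomial models
tau <- abs(rcauchy(1, 0, global_scale * sigma))   # half-Cauchy global shrinkage parameter
lambda_j <- abs(rcauchy(J, 0, 1))                 # half-Cauchy local scale parameters
beta_j <- rnorm(J, 0, tau * lambda_j)             # beta_j ~ Normal(0, tau^2 * lambda_j^2)
## most coefficients are pulled toward zero, but the long Cauchy
## tails let a few lambda_j (and hence beta_j) become very large
quantile(abs(beta_j), c(0.5, 0.9, 0.99))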
When rates = FALSE and the model is Poisson or binomial, the fitted values returned by the fitted method are the expected value of the response variable. The rates argument is used to translate count outcomes to rates by dividing by the appropriate denominator. The behavior of the rates argument depends on the model specification. Consider a Poisson model of disease incidence, such as the following intercept-only case:
fit <- stan_glm(y ~ offset(log(E)),
    data = data,
    family = poisson())
If the fitted values are extracted using rates = FALSE, then fitted(fit) will return the expectation of \(y\). If rates = TRUE (the default), then fitted(fit) will return the expected value of the rate \(\frac{y}{E}\).
If a binomial model is used instead of the Poisson, then using rates = TRUE will return the expectation of \(\frac{y}{N}\), where \(N\) is the sum of the number of 'successes' and 'failures', as in:
fit <- stan_glm(cbind(successes, failures) ~ 1,
    data = data,
    family = binomial())
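For example, continuing from either fit above:
## expected counts versus expected rates from the same model
f_counts <- fitted(fit, rates = FALSE)   # expectation of y
f_rates <- fitted(fit, rates = TRUE)     # expectation of the rate; the default
head(f_rates)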
To include a varying intercept (or "random effects") term, alpha_re, specify the grouping variable here using formula syntax, as in ~ ID. Then, alpha_re is a vector of parameters added to the linear predictor of the model, and:
alpha_re ~ N(0, alpha_tau)
alpha_tau ~ Student_t(d.f., location, scale).
With the CAR model, any alpha_re term should be at a different level or scale than the observations; that is, at a different scale than the autocorrelation structure of the CAR model itself.
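For instance, a minimal sketch of this usage; the re argument name is an assumption here (the text above only fixes the formula syntax ~ ID), so check the fitting function's argument list:
## varying intercept for each level of the grouping variable ID
fit <- stan_glm(deaths ~ offset(log(expected)),
                re = ~ ID,      # assumed argument name for the alpha_re grouping formula
                data = data,
                family = poisson())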
Spatially lagged covariates can be included using the slx argument:
stan_glm(y ~ x1 + x2, slx = ~ x1 + x2, ...),
which is a shortcut for
stan_glm(y ~ I(W %*% x1) + I(W %*% x2) + x1 + x2, ...)
SLX terms will always be prepended to the design matrix, as above, which is important to know when setting prior distributions for regression coefficients.
For measurement error (ME) models, the SLX argument is the only way to include spatially lagged covariates, since the SLX term needs to be re-calculated on each iteration of the MCMC algorithm.
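To make the equivalence concrete, here is a small sketch of the manual version, assuming W is a row-standardized spatial weights matrix and df is a data frame with columns y, x1, and x2:
## spatial lag of each covariate, computed by hand
df$w_x1 <- as.numeric(W %*% df$x1)
df$w_x2 <- as.numeric(W %*% df$x2)
## equivalent to: stan_glm(y ~ x1 + x2, slx = ~ x1 + x2, ...)
fit <- stan_glm(y ~ w_x1 + w_x2 + x1 + x2, data = df, family = gaussian())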
To include a varying intercept (or "random effects") term, alpha_re, specify the grouping variable here using formula syntax, as in ~ ID. Then, alpha_re is a vector of parameters added to the linear predictor of the model, and:
alpha_re ~ N(0, alpha_tau)
alpha_tau ~ Student_t(d.f., location, scale).
Before using this term, read the Details section and the type argument. Specifically, if you use type = bym, then an observation-level re term is already included in the model. (Similarly for type = bym2.)
The terms \(\tilde{\phi}\), \(\tilde{\theta}\) are standard normal deviates, \(\rho\) is restricted to values between zero and one, and \(S\) is the 'scale_factor' (a constant term provided by the user). By default, the 'scale_factor' is equal to one, so that it does nothing. Riebler et al. (2016) argue that the interpretation or meaning of the scale of the ICAR model depends on the graph structure of the connectivity matrix \(C\). This implies that the same prior distribution assigned to \(\tau_s\) will differ in its implications if \(C\) is changed; in other words, the priors are not transportable across models, and models that use the same nominal prior actually have different priors assigned to \(\tau_s\).
Borrowing R code from Morris (2017) and following Freni-Sterrantino et al. (2018), the following code can be used to create the 'scale_factor' \(S\) for the BYM2 model, given a spatial adjacency matrix \(C\) (note: this requires the INLA R package):
## create a list of data for stan_icar
+icar.data <- geostan::prep_icar_data(C)
+## calculate scale_factor for each of k connected group of nodes
+k <- icar.data$k
+scale_factor <- vector(mode = "numeric", length = k)
+for (j in 1:k) {
+ g.idx <- which(icar.data$comp_id == j)
+ if (length(g.idx) == 1) {
+ scale_factor[j] <- 1
+ next
+ }
+ Cg <- C[g.idx, g.idx]
+ scale_factor[j] <- scale_c(Cg)
+}
This code adjusts for 'islands' or areas with zero neighbors, and it also handles disconnected graph structures (see Donegan and Morris 2021). Following Freni-Sterrantino et al. (2018), disconnected components of the graph structure are given their own intercept term; however, this value is added to \(\phi\) automatically inside the Stan model, so the user never needs to make any adjustments for it. (To avoid complications from using a disconnected graph structure, you can apply a proper CAR model instead of the ICAR: stan_car.)
Note, the code above requires the scale_c function; it has package dependencies that are not included in geostan. To use scale_c, you have to load the following R function:
#' compute scaling factor for adjacency matrix, accounting for differences in spatial connectivity
+#'
+#' @param C connectivity matrix
+#'
+#' @details
+#'
+#' Requires the following packages:
+#'
+#' library(Matrix)
+#' library(INLA);
+#' library(spdep)
+#' library(igraph)
+#'
+#' @source
+#'
+#' Morris, Mitzi (2017). Spatial Models in Stan: Intrinsic Auto-Regressive Models for Areal Data. <https://mc-stan.org/users/documentation/case-studies/icar_stan.html>
+#'
+scale_c <- function(C) {
+ geometric_mean <- function(x) exp(mean(log(x)))
+ N = dim(C)[1]
+ Q = Diagonal(N, rowSums(C)) - C
+ Q_pert = Q + Diagonal(N) * max(diag(Q)) * sqrt(.Machine$double.eps)
+ Q_inv = inla.qinv(Q_pert, constr=list(A = matrix(1,1,N),e=0))
+ scaling_factor <- geometric_mean(Matrix::diag(Q_inv))
+ return(scaling_factor)
+}
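Putting the pieces together, a usage sketch under the assumptions above (Matrix and INLA loaded, C a binary adjacency matrix, scale_factor computed by the loop shown earlier); note that the scale_factor argument name for the fitting function is an assumption here, so check ?stan_icar for the exact interface:
library(Matrix)
library(INLA)
## pass the per-component scaling factors to the BYM2 model
fit <- stan_icar(y ~ offset(log(E)), data = data, C = C,
                 type = "bym2", scale_factor = scale_factor,  # assumed argument name
                 family = poisson())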
To include a varying intercept (or "random effects") term, alpha_re, specify the grouping variable here using formula syntax, as in ~ ID. Then, alpha_re is a vector of parameters added to the linear predictor of the model, and:
alpha_re ~ N(0, alpha_tau)
alpha_tau ~ Student_t(d.f., location, scale).
With the SAR model, any alpha_re term should be at a different level or scale than the observations; that is, at a different scale than the autocorrelation structure of the SAR model itself.