chapter05.Rmd

---
title: "Chapter 05"
author: "Scott Spencer"
date: "8/22/2018"
output: html_document
---

```{r setup, include=FALSE}
# Chunk defaults for the whole document: echo the code, hide warnings and
# messages, and do not capture errors in the output.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  error = FALSE
)

# Packages attached for every chunk below. The order is kept as-is because
# attach order decides which package wins on masked names.
library(dplyr)
library(tidyr)
library(rstan)
library(skimr)
library(ggplot2)
library(ggthemes)

# Use the Tufte theme with a sans-serif base font for all plots.
theme_set(theme_tufte(base_family = 'sans'))
```

The code below is meant as a directly-in-Stan translation of the examples in Chapter 5 of McElreath's *Statistical Rethinking*.

# Chapter 5

```{r}
# Load the WaffleDivorce data set from the rethinking package, keep it under
# the short name `d`, and drop the original binding.
data('WaffleDivorce', package = 'rethinking')
d <- WaffleDivorce
rm(WaffleDivorce)
```

Figure 5.1

```{r}
# Figure 5.1: divorce rate against WaffleHouses / Population, with a linear
# fit and its 89% band. A handful of states are labelled just to the right of
# their points.
# The x/y mapping is declared once on the plot; layers inherit it and only
# override what differs. `TRUE` is spelled out because `T` can be reassigned.
ggplot(d, aes(x = WaffleHouses / Population, y = Divorce)) +
  stat_smooth(method = 'lm', level = .89, fullrange = TRUE,
              color = 'black', alpha = .1, lwd = .3) +
  geom_point(shape = 21, color = 'dodgerblue') +
  geom_text(data = filter(d, Loc %in% c('AL', 'AR', 'GA', 'OK', 'ME', 'NJ', 'SC')),
            aes(x = WaffleHouses / Population + 1.2, label = Loc), size = 3) +
  scale_x_continuous(limits = c(0, 50)) +
  labs(x = 'Waffle Houses per million', y = 'Divorce Rate')
```

## 5.1 Spurious association

Standardize the predictors

```{r}
# Append z-scored copies of the two predictors (centered on the sample mean,
# scaled by the sample sd). They are added in this order at the end of `d`.
d$MedianAgeMarriage_z <-
  (d$MedianAgeMarriage - mean(d$MedianAgeMarriage)) / sd(d$MedianAgeMarriage)
d$Marriage_z <-
  (d$Marriage - mean(d$Marriage)) / sd(d$Marriage)
```


Write a Stan model

```{stan output.var="m05_1"}
// m05_1: linear regression of divorce rate on standardized median age at
// marriage.
data {
  int<lower=0> N;            // number of observations
  vector[N] divorce;         // outcome
  vector[N] median_age_z;    // predictor
}
parameters {
  real a;                    // intercept
  real bA;                   // slope on median_age_z
  // No prior statement is written for sigma in the model block; only the
  // declared bounds constrain it.
  real<lower=0, upper=10> sigma;
}
model {
  // linear predictor
  vector[N] mu;
  mu = a + bA * median_age_z;

  // likelihood
  target += normal_lpdf(divorce | mu, sigma);

  // priors
  target += normal_lpdf(a | 10, 10);
  target += normal_lpdf(bA | 0, 10);
}
```

Organize data for Stan model, and sample.

```{r}
# Data list for m05_1; the element names must match the Stan data block.
dat <- list(N = NROW(d))
dat$divorce <- d$Divorce
dat$median_age_z <- d$MedianAgeMarriage_z

# Two chains of 1000 iterations each, one core per chain.
fit05_1 <- sampling(
  object = m05_1,
  data = dat,
  chains = 2,
  cores = 2,
  iter = 1000
)
```

Summarise model

```{r}
# Posterior summary for m05_1 at the 10%, 50% and 90% quantiles.
print(x = fit05_1, probs = c(0.1, 0.5, 0.9))
```

Write another Stan model

```{stan output.var="m05_2"}
// m05_2: linear regression of divorce rate on standardized marriage rate.
// Declarations mirror the sibling model m05_1: N is bounded below, and sigma
// carries the same (0, 10) bounds. With only a lower bound and no prior
// statement, sigma would have an improper flat prior on (0, inf).
data {
  int<lower=0> N;          // number of observations
  vector[N] divorce;       // outcome
  vector[N] marriage_z;    // predictor
}
parameters {
  real a;                  // intercept
  real bM;                 // slope on marriage_z
  // No prior statement below, so these bounds are what constrain sigma.
  real<lower=0, upper=10> sigma;
}
model {
  vector[N] mu = a + marriage_z * bM;

  // likelihood
  target += normal_lpdf(divorce | mu, sigma);

  // priors
  target += normal_lpdf(a | 10, 10);
  target += normal_lpdf(bM | 0, 10);
}

```

Organize data for Stan and sample.

```{r}
# Data list for m05_2; the element names must match the Stan data block.
dat <- list(N = NROW(d))
dat$divorce <- d$Divorce
dat$marriage_z <- d$Marriage_z

# Two chains of 1000 iterations each, one core per chain.
fit05_2 <- sampling(
  object = m05_2,
  data = dat,
  chains = 2,
  cores = 2,
  iter = 1000
)
```

Summarise the model

```{r}
# Posterior summary for m05_2 at the 10%, 50% and 90% quantiles.
print(x = fit05_2, probs = c(0.1, 0.5, 0.9))
```

Set up the plot for the first model.

```{r}
# Posterior draws from the median-age model.
post <- as.data.frame(fit05_1)

# Expected divorce rate for every draw at standardized age `x`.
f_mu <- function(x) post$a + post$bA * x
A_z_new <- seq(-3, 3)

# draws x grid matrix. vapply() pins the shape to one column per grid value,
# so there is no unnamed matrix to convert to a tibble and rename afterwards.
mu_draws <- vapply(A_z_new, f_mu, numeric(nrow(post)))

# 80% highest-density interval per grid value, computed once
# (row 1 = lower bound, row 2 = upper bound).
mu_hpdi <- apply(mu_draws, 2, HDInterval::hdi, credMass = 0.8)

# One summary row per grid value. The plot layers only need these four
# columns, so the individual draws are not carried into the plot data.
mu <- as_tibble(data.frame(
  A_z    = as.numeric(A_z_new),
  hpdi_l = mu_hpdi[1, ],
  hpdi_h = mu_hpdi[2, ],
  mu     = colMeans(mu_draws)
))

# Raw data plus the posterior mean line and its interval band.
# `p` is an empty canvas that the second-model plot reuses.
p <- ggplot() 
p1 <- p + 
  geom_point(data = d,
             aes(MedianAgeMarriage_z, Divorce), 
             shape = 1, color = 'dodgerblue') +
  geom_ribbon(data = mu,
              aes(x = A_z, ymin = hpdi_l, ymax = hpdi_h), alpha = .1) +
  geom_line(data = mu,
            aes(x = A_z, y = mu))
```

Setup plot for second model

```{r}
# Posterior draws from the marriage-rate model.
post <- as.data.frame(fit05_2)

# Expected divorce rate for every draw at standardized marriage rate `x`.
f_mu <- function(x) post$a + post$bM * x
M_z_new <- seq(-3, 3)

# draws x grid matrix. vapply() pins the shape to one column per grid value,
# so there is no unnamed matrix to convert to a tibble and rename afterwards.
mu_draws <- vapply(M_z_new, f_mu, numeric(nrow(post)))

# 80% highest-density interval per grid value, computed once
# (row 1 = lower bound, row 2 = upper bound).
mu_hpdi <- apply(mu_draws, 2, HDInterval::hdi, credMass = 0.8)

# One summary row per grid value. The plot layers only need these four
# columns, so the individual draws are not carried into the plot data.
mu <- as_tibble(data.frame(
  M_z    = as.numeric(M_z_new),
  hpdi_l = mu_hpdi[1, ],
  hpdi_h = mu_hpdi[2, ],
  mu     = colMeans(mu_draws)
))

# Raw data plus the posterior mean line and its interval band, drawn on the
# empty canvas `p` created for the first-model plot.
p2 <- p + 
  geom_point(data = d,
             aes(Marriage_z, Divorce), 
             shape = 1, color = 'dodgerblue') +
  geom_ribbon(data = mu,
              aes(x = M_z, ymin = hpdi_l, ymax = hpdi_h), alpha = .1) +
  geom_line(data = mu,
            aes(x = M_z, y = mu))
```

Figure 5.2

```{r}
# Figure 5.2: marriage-rate panel on the left, median-age panel on the right.
# gridExtra stays attached because a later chunk calls grid.arrange() again.
library(gridExtra)
grid.arrange(grobs = list(p2, p1), nrow = 1)
```

### 5.1.2. Fitting the model

Fit model using both predictors

```{stan output.var="m05_3"}
// m05_3: divorce rate regressed on both standardized predictors.
data {
  int N;                     // number of observations
  vector[N] divorce;         // outcome
  vector[N] marriage_z;      // predictor: marriage rate
  vector[N] median_age_z;    // predictor: median age at marriage
}
parameters {
  real a;                    // intercept
  real bA;                   // slope on median_age_z
  real bM;                   // slope on marriage_z
  real<lower=0> sigma;       // residual standard deviation
}
model {
  // linear predictor
  vector[N] mu;
  mu = a + median_age_z * bA + marriage_z * bM;

  // likelihood
  target += normal_lpdf(divorce | mu, sigma);

  // priors
  target += normal_lpdf(a | 10, 10);
  target += normal_lpdf(bA | 0, 10);
  target += normal_lpdf(bM | 0, 10);
  target += exponential_lpdf(sigma | 1);
}
```

Organize data and sample. Only the observed data is passed to Stan; predictions for new data are computed afterwards in R from the posterior draws.

```{r}
# Data list for m05_3; the element names must match the Stan data block.
dat <- list(N = NROW(d))
dat$divorce <- d$Divorce
dat$marriage_z <- d$Marriage_z
dat$median_age_z <- d$MedianAgeMarriage_z

# Two chains of 1000 iterations each, one core per chain.
fit05_3 <- sampling(
  object = m05_3,
  data = dat,
  chains = 2,
  cores = 2,
  iter = 1000
)
```

Summarise model

```{r}
# Posterior summary for m05_3 at the 10%, 50% and 90% quantiles.
print(x = fit05_3, probs = c(.1, .5, .9))
```

### 5.1.3. Plotting multivariate posteriors

```{stan output.var="m05_4"}
// m05_4: regress one predictor (M_z) on the other (A_z).
data {
  int<lower=1> N;    // number of observations
  vector[N] A_z;     // explanatory variable
  vector[N] M_z;     // response
} 
parameters {
  real a;                 // intercept
  real b;                 // slope on A_z
  real<lower=0> sigma;    // residual standard deviation
}
model {
  // linear predictor
  vector[N] mu;
  mu = a + A_z * b;

  // likelihood
  target += normal_lpdf(M_z | mu, sigma);

  // priors
  target += normal_lpdf(a | 0, 10);
  target += normal_lpdf(b | 0, 10);
  target += exponential_lpdf(sigma | 1);
}
```

```{r}
# Data list for m05_4; the element names must match the Stan data block.
dat <- list(N = NROW(d))
dat$A_z <- d$MedianAgeMarriage_z
dat$M_z <- d$Marriage_z

# Two chains of 1000 iterations each, one core per chain.
fit05_4 <- sampling(
  object = m05_4,
  data = dat,
  chains = 2,
  cores = 2,
  iter = 1000
)
```

```{r}
# Posterior summary for m05_4 at the 10%, 50% and 90% quantiles.
print(x = fit05_4, probs = c(.1, .5, .9))
```

#### 5.1.3.1 Predictor residual plots

```{r}
# Posterior draws from the predictor-on-predictor model.
post <- as.matrix(fit05_4)

# Expected standardized marriage rate, one row per draw and one column per
# state. The orientation matters: adding the length-S intercept vector to a
# matrix with S rows recycles it down each column, so a[s] is paired with
# b[s]. With states in rows instead, the intercept draws would be recycled
# against the wrong dimension and decoupled from their slopes.
mu_draws <- post[, "a"] + outer(post[, "b"], d$MedianAgeMarriage_z)

# Posterior mean prediction per state, and the residual marriage rate.
mu <- colMeans(mu_draws)
resid <- d$Marriage_z - mu

# The segment data lives in its own frame so aes() never has to reach into
# `d$...` or pick up an unrelated `mu` column from `d`.
resid_df <- data.frame(
  age_z    = d$MedianAgeMarriage_z,
  fitted   = mu,
  observed = d$Marriage_z
)

ggplot() + 
  geom_segment(data = resid_df,
               aes(x = age_z, xend = age_z,
                   y = fitted, yend = observed)) +
  geom_point(data = d,
             aes(MedianAgeMarriage_z, Marriage_z), 
             shape = 21, color = 'dodgerblue', fill = 'white') +
  geom_abline(slope = mean(post[, "b"]), intercept = mean(post[, "a"]))

```

#### 5.1.3.2 counterfactual plots


```{r}
# Posterior draws from the two-predictor model (one row per draw).
post <- as.matrix(fit05_3)

# Counterfactual grid: every combination of the two standardized predictors.
# The column order (median_age_z, marriage_z) lines up with (bA, bM).
nd <- as.matrix(
  expand.grid(median_age_z = seq(-3, 3),
              marriage_z = seq(-3, 3))
)

# Expected divorce rate: draws in rows, grid points in columns.
slopes <- post[, c("bA", "bM")]
mu <- post[, "a"] + slopes %*% t(nd)

# Posterior mean and HDI (package default mass) of mu per grid point.
avg <- colMeans(mu)
hdi <- apply(mu, 2, HDInterval::hdi)

# Simulate divorce rates. rnorm() recycles the per-draw means and sds to
# reach `iter` values for each grid point.
iter <- 1e4
y_hat <- matrix(nrow = iter, ncol = NROW(nd))
for (i in seq_len(NROW(nd))) {
  mu_i <- post[, "a"] + slopes %*% as.matrix(nd[i, ])
  y_hat[, i] <- rnorm(iter, mu_i, post[, "sigma"])
}

# Mean and HDI of the simulated outcomes per grid point.
y_hat_avg <- colMeans(y_hat)
y_hat_hdi <- apply(y_hat, 2, HDInterval::hdi)

# Attach the summaries to the grid for plotting.
nd <- bind_cols(
  as_tibble(nd),
  avg = avg, 
  mu_hdi_l = hdi[1, ], 
  mu_hdi_h = hdi[2, ],
  y_hdi_l = y_hat_hdi[1, ],
  y_hdi_h = y_hat_hdi[2, ]
)
```

Setup Figure 5.5a

```{r}
# Figure 5.5a: predicted divorce rate against standardized median age, one
# faint line per marriage-rate value. The marriage_z == 0 line is highlighted
# with its mu band and prediction band, and the lines are labelled at both
# ends with their marriage_z value.
# Layers inherit x, y and group from the plot, so each one states only what
# it overrides.
p1 <- ggplot(nd, aes(x = median_age_z, y = avg, group = marriage_z)) + 
  geom_line(color = 'gray90') +
  geom_ribbon(data = filter(nd, marriage_z == 0),
              aes(ymin = mu_hdi_l, ymax = mu_hdi_h), alpha = .1) +
  geom_ribbon(data = filter(nd, marriage_z == 0),
              aes(ymin = y_hdi_l, ymax = y_hdi_h), alpha = .1) +
  geom_line(data = filter(nd, marriage_z == 0)) + 
  geom_text(data = filter(nd, median_age_z == min(median_age_z)), 
            aes(label = marriage_z, x = median_age_z - 0.1), size = 2) +
  geom_text(data = filter(nd, median_age_z == max(median_age_z)), 
            aes(label = marriage_z, x = median_age_z + 0.1), size = 2) +
  labs(x = 'Standardized Median Age of Marriage', y = 'Divorce rate') 
```

Setup Figure 5.5b

```{r}
# Figure 5.5b: predicted divorce rate against standardized marriage rate, one
# faint line per median-age value. The median_age_z == 0 line is highlighted
# with its mu band and prediction band, and the lines are labelled at both
# ends with their median_age_z value.
# Layers inherit x, y and group from the plot, so each one states only what
# it overrides.
p2 <- ggplot(nd, aes(x = marriage_z, y = avg, group = median_age_z)) + 
  geom_line(color = 'gray90') +
  geom_ribbon(data = filter(nd, median_age_z == 0),
              aes(ymin = mu_hdi_l, ymax = mu_hdi_h), alpha = .1) +
  geom_ribbon(data = filter(nd, median_age_z == 0),
              aes(ymin = y_hdi_l, ymax = y_hdi_h), alpha = .1) +
  geom_line(data = filter(nd, median_age_z == 0)) + 
  geom_text(data = filter(nd, marriage_z == min(marriage_z)), 
            aes(label = median_age_z, x = marriage_z - 0.1), size = 2) +
  geom_text(data = filter(nd, marriage_z == max(marriage_z)), 
            aes(label = median_age_z, x = marriage_z + 0.1), size = 2) +
  labs(x = 'Standardized Rate of Marriage', y = 'Divorce rate')
```

Figure 5.5

```{r}
# Figure 5.5: marriage-rate panel on the left, median-age panel on the right.
grid.arrange(grobs = list(p2, p1), nrow = 1)
```


#### 5.1.3.3 posterior prediction plots

```{r}
# Re-extract the draws of the two-predictor model so this chunk does not
# depend on whichever fit last overwrote the global `post`.
post <- as.matrix(fit05_3)

# Observed standardized predictors, selected by name (not by position) and in
# the same order as the slope columns they multiply.
slopes <- post[, c("bA", "bM")]
X <- as.matrix(d[, c("MedianAgeMarriage_z", "Marriage_z")])

# Expected divorce rate: draws in rows, states in columns.
mu <- post[, "a"] + slopes %*% t(X)

# Posterior mean and HDI (package default mass) of mu per state.
avg <- colMeans(mu)
hdi <- apply(mu, 2, HDInterval::hdi)

# Simulate divorce rates. rnorm() recycles the per-draw means and sds to
# reach `iter` values for each state.
iter <- 1e4
y_hat <- matrix(nrow = iter, ncol = NROW(X))
for (i in seq_len(NROW(X))) {
  y_hat[, i] <- rnorm(iter, post[, "a"] + slopes %*% X[i, ], post[, "sigma"])
}

# Mean and HDI of the simulated outcomes per state.
y_hat_avg <- colMeans(y_hat)
y_hat_hdi <- apply(y_hat, 2, HDInterval::hdi)

# Store the per-state summaries alongside the data for the figures below.
d <- d %>% mutate(mu = avg,
                  mu_hdi_l = hdi[1, ],
                  mu_hdi_h = hdi[2, ],
                  y_hdi_l = y_hat_hdi[1, ],
                  y_hdi_h = y_hat_hdi[2, ])
```

Figure 5.6a predicted versus observed

```{r}
# Figure 5.6a: estimated mean divorce rate (point, with its interval as a
# vertical segment) against the observed rate. The dashed identity line marks
# a perfect match.
ggplot(data = d, mapping = aes(x = Divorce)) + 
  geom_abline(slope = 1, intercept = 0, 
              color = 'gray70', linetype = 'dashed') +
  geom_segment(aes(xend = Divorce, y = mu_hdi_l, yend = mu_hdi_h),
               color = 'dodgerblue') +
  geom_point(aes(y = mu), 
             shape = 1, color = 'dodgerblue', fill = 'white') +
  labs(x = "Observed divorce rate", y = 'Estimated average divorce rate',
       subtitle = 'Observed versus\nestimated for each state')

```

Figure 5.6b

```{r}
# Figure 5.6b: prediction error per state, ordered from most over-predicted
# to most under-predicted.
# pred_err is computed once and reused for both the ordering and the points.
d1 <- d %>% 
  mutate(pred_err = Divorce - mu) %>%
  mutate(Loc = factor(Loc, levels = Loc[order(pred_err)]))

ggplot(d1) +
  geom_vline(xintercept = 0, color = 'gray80') +
  # dotted guide line across the panel for every state
  geom_segment(aes(x = -6, xend = 6, y = Loc, yend = Loc), 
               linetype = 'dotted', color = 'gray90') +
  # interval of the error implied by the interval of mu
  # (geom_segment has no `shape` aesthetic, so none is passed)
  geom_segment(aes(x = Divorce - mu_hdi_h, xend = Divorce - mu_hdi_l,
                   y = Loc, yend = Loc), 
               color = 'black') +
  geom_point(aes(pred_err, Loc), shape = 21, fill = 'white') +
  # bounds of the error implied by the simulated-outcome interval
  geom_point(aes(Divorce - y_hdi_l, Loc), shape = 3, color = 'gray80') +
  geom_point(aes(Divorce - y_hdi_h, Loc), shape = 3, color = 'gray80') +
  scale_x_continuous(limits = c(-6, 6)) +
  labs(x = '', y = '')
```

Figure 5.6c

```{r}
# Figure 5.6c: prediction error (observed minus estimated mean divorce rate)
# against WaffleHouses / Population, with a linear fit; selected states are
# labelled just to the right of their points.
# The x label names the plotted quantity the same way Figure 5.1 does, and
# `TRUE` is spelled out because `T` can be reassigned.
ggplot(d, aes(x = WaffleHouses / Population, y = Divorce - mu)) +
  stat_smooth(method = 'lm', fullrange = TRUE,
              color = 'black', alpha = .2, lwd = .4) +
  geom_point(shape = 21, color = 'dodgerblue') +
  geom_text(data = filter(d, Loc %in% c('AL', 'AR', 'GA', 'OK', 'ME', 'NJ', 'SC')),
            aes(x = WaffleHouses / Population + 1.2, label = Loc), size = 3) +
  labs(x = 'Waffle Houses per million', y = 'Divorce error')
```