Notebook_Overdispersion.Rmd

---
title: "The longitudinal dynamics and natural history of clonal haematopoiesis"
subtitle: "Modelling the technical overdispersion"
author: 
    - José Guilherme de Almeida
    - Moritz Gerstung
output: 
  html_document:
    theme: lumen
    highlight: tango
    code_folding: hide
    df_print: paged
    toc: true
    toc_float: true
    toc_collapsed: true
    toc_depth: 4
    number_sections: true
---

```{r setup, include=FALSE}
# Default chunk option for the whole notebook: echo the source of each chunk
# in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
```

```{r import_functions, include=FALSE}
# Create the output directory for this notebook's figures. `recursive = TRUE`
# also creates the parent "figures" directory; `showWarnings = FALSE` keeps
# re-runs quiet when the directories already exist.
dir.create("figures/overdispersion", recursive = TRUE, showWarnings = FALSE)
# Project helper functions (presumably where helpers such as theme_gerstung()
# and variable_summaries() come from — confirm).
source("Scripts/vaf_dynamics_functions.R")
# Fix the RNG seed before the data-preparation script runs.
set.seed(42)
# Project data preparation (presumably defines `full_data` used below — confirm).
source("Scripts/prepare_data.R")
```

# Modelling the technical overdispersion

Technical replicate data show some level of technical overdispersion. We can account for it in our current model by replacing the binomial likelihood of the counts with a beta-binomial one, with a fixed $\beta$ parameter and an $\alpha$ parameter that depends on $\beta$ and on the probability inferred by the model. The relationship follows from the expected value of the beta distribution, $E[X] = \frac{\alpha}{\alpha+\beta}$: setting $E[X] = p$ gives $\alpha = \frac{\beta p}{1 - p}$.

Considering the above, we can model the counts as $counts \sim BB(n = coverage,\ \alpha = \frac{\beta p}{1 - p},\ \beta)$.

```{r visualising overdispersion,fig.height=2,fig.width=2.5}
# One-hot encode a vector of category labels.
#
# Args:
#   factor_vector: vector of labels (character, factor, ...). Labels are
#     compared as character strings.
#
# Returns:
#   A numeric matrix with one row per element of `factor_vector` and one
#   column per distinct label, columns ordered by `sort()` of the labels.
#   Entry [i, j] is 1 when element i equals label j and 0 otherwise; an NA
#   element yields a row of NA. The result is always a matrix of shape
#   length(factor_vector) x number-of-labels, including for a single distinct
#   label or for empty input. Row names follow the `sapply()` convention:
#   the element names when present, otherwise the labels themselves for
#   character input.
one_hot_encode <- function(factor_vector) {
  labels <- as.character(factor_vector)
  factor_levels <- sort(unique(labels))
  n_rows <- length(labels)
  n_levels <- length(factor_levels)

  # Compare every element with every level in one vectorised pass; the
  # result is laid out column by column (one column per level).
  hits <- rep(labels, times = n_levels) == rep(factor_levels, each = n_rows)
  encoded <- matrix(as.numeric(hits), nrow = n_rows, ncol = n_levels)

  row_labels <- names(factor_vector)
  if (is.null(row_labels) && is.character(factor_vector)) {
    row_labels <- labels
  }
  if (length(row_labels) > 0) {
    rownames(encoded) <- row_labels
  }
  encoded
}

# TruQ technical replicates: attach the 5th and 95th percentiles of
# Beta(MUTcount, TOTALcount - MUTcount) as an interval around each measurement.
tru_q_data <- read.table('data/TruQ.txt',header = TRUE) %>%
  mutate(low = qbeta(0.05,MUTcount,TOTALcount - MUTcount),
         high = qbeta(0.95,MUTcount,TOTALcount - MUTcount))

# Cohort technical replicates. VAF and mutant count are halved for variants
# on chromosome X (presumably to account for hemizygosity — confirm).
replicate_data <- read.table('data/Replicates_all.txt',header = TRUE) %>%
  mutate(VAF = ifelse(CHR == "X", VAF / 2,VAF),
         MUTcount = ifelse(CHR == "X", floor(MUTcount / 2),MUTcount))

# Age of each individual at each phase, looked up in `full_data`.
ages <- merge(full_data,replicate_data,by = c("SardID","Phase")) %>%
  select(Age, SardID, Phase) %>%
  unique

# (SardID, variant) pairs that were measured in more than one phase.
replicated_data_multiple <- replicate_data %>%
  group_by(SardID,variant) %>%
  summarise(count = length(unique(Phase))) %>%
  ungroup() %>%
  subset(count > 1)

replicate_data <- merge(
  replicate_data,
  ages,
  by = c("SardID","Phase"),
  all = TRUE
) %>% 
  # Keep rows whose (SardID, variant) *pair* was seen in several phases.
  # Testing the two columns independently would also admit a single-phase
  # variant whenever its individual and its variant each appear in some
  # other multi-phase pair.
  dplyr::semi_join(replicated_data_multiple,by = c("SardID","variant")) %>% 
  # `variant` is split on '_': field 2 is used as the position and fields
  # 3-4 as the change. Row-wise evaluation is needed because of the `[[1]]`.
  rowwise() %>% 
  mutate(position = str_split(variant,'_')[[1]][2],
         change = gsub('_','>',paste(str_split(variant,'_')[[1]][3:4],collapse='_'))) %>%
  # Drop the row-wise grouping so later verbs operate on whole columns.
  ungroup() %>%
  # Facet label, then the same 5%-95% beta interval as for the TruQ data.
  mutate(l = sprintf('%s (%s)\nSardID=%s',position,change,SardID)) %>%
  mutate(low = qbeta(0.05,MUTcount,TOTALcount - MUTcount),
         high = qbeta(0.95,MUTcount,TOTALcount - MUTcount))

# Observed vs. expected VAF for the TruQ replicates, one panel per gene.
# Points and their 5%-95% beta interval bars are dodged by replicate.
truq_plot <- tru_q_data %>% 
  ggplot(aes(x = VAF_expected,y = VAF_observed,group = Replicate)) + 
  geom_errorbar(aes(ymin = low,ymax = high,colour = as.factor(Replicate)),
                size = 0.25,width = 0.001,
                position = position_dodge(width = 0.0025)) + 
  geom_point(aes(colour = as.factor(Replicate)),size = 0.5,
             position = position_dodge(width = 0.0025)) + 
  theme_gerstung(base_size = 6) +
  facet_wrap(~ Gene) + 
  ggsci::scale_color_lancet(name = "Replicate") +
  theme(legend.position = 'bottom',legend.key.size = unit(0.1,"cm")) + 
  xlab("Expected VAF") +
  ylab("Observed VAF") + 
  scale_x_continuous(labels = function(x) sprintf('%s%%',x*100)) +
  scale_y_continuous(labels = function(x) sprintf('%s%%',x*100))

# Save explicitly: `ggsave()` is not a plot component, and adding its return
# value to a plot with `+` fails on current ggplot2 releases.
ggsave("figures/overdispersion/overdispersion_vis_truq.pdf",
       plot = truq_plot,height = 2,width = 2.5,useDingbats = FALSE)

# Render the figure in the knitted notebook.
truq_plot
```

```{r,fig.height=3.5,fig.width=4.5}
# VAF per phase for each replicated clone, with 5%-95% beta intervals,
# on a log10 y axis with free y scales per panel.
sard_plot <- replicate_data %>% 
  # A lower bound of exactly 0 cannot be drawn on a log scale; replace it
  # with a small positive value.
  mutate(low = ifelse(low == 0,1e-8,low)) %>% 
  ggplot(aes(x = as.factor(Phase),y = VAF,colour = as.factor(Replicate))) + 
  geom_point(size = 0.5,position = position_dodge(width = 0.3)) + 
  geom_linerange(aes(ymin = low,ymax = high),size = 0.25,
                 position = position_dodge(width = 0.3)) + 
  theme_gerstung(base_size = 6) +
  # `scales` spelled out in full rather than relying on partial matching.
  facet_wrap(~ reorder(l,SardID),scales = 'free_y') + 
  ggsci::scale_color_lancet(name = "Replicate") +
  theme(legend.position = 'bottom',legend.key.size = unit(0.1,"cm")) + 
  xlab("Phase") +
  ylab("VAF") + 
  coord_cartesian(ylim = c(0.001,NA)) +
  scale_y_continuous(labels = function(x) sprintf('%s%%',x*100),trans = 'log10')

# Save explicitly: `ggsave()` is not a plot component, and adding its return
# value to a plot with `+` fails on current ggplot2 releases.
ggsave("figures/overdispersion/overdispersion_vis_sard.pdf",
       plot = sard_plot,height = 3.5,width = 4.5,useDingbats = FALSE)

# Render the figure in the knitted notebook.
sard_plot
```

The intervals here are the 5th–95th percentiles of a beta distribution ($Beta(\alpha = count,\ \beta = coverage - count)$).

## Modelling the overdispersion for TruQ data

```{r,fig.height=3,fig.width=6}
# Shift both VAFs by the equivalent of one read (1 / coverage) so that the
# beta shape parameters below stay strictly positive.
VAF_expected <- tru_q_data$VAF_expected + 1/tru_q_data$TOTALcount
VAF_observed <- tru_q_data$VAF_observed + 1/tru_q_data$TOTALcount

# Overdispersion parameter: exponentially distributed, with a rate that is
# itself a free variable constrained to be non-negative.
beta_rate <- variable(lower = 0,upper = Inf)
beta_variable <- exponential(
  rate = beta_rate
)

# Likelihood: the expected VAF follows a beta distribution whose mean is the
# observed VAF (shape1 / (shape1 + shape2) = VAF_observed).
distribution(VAF_expected) <- beta(
  shape1 = (VAF_observed * beta_variable) / (1 - VAF_observed),
  shape2 = beta_variable
)

m <- model(beta_variable,beta_rate)

draws <- mcmc(m,
              sampler = hmc(Lmin = 10,Lmax = 20),
              warmup = 2000,
              n_samples = 2000)

mcmc_trace(draws)

# Stack the chains and summarise the draws. `values` is passed by name:
# newer greta releases collect the target arrays through `...`, so a
# positional `draws` would be treated as another target.
# NOTE(review): variable_summaries() is a project helper; entry [1,1] of its
# result is used later as the overdispersion estimate — confirm what it holds.
technical_beta_values <- calculate(beta_variable,values = draws) %>%
  do.call(what = rbind) %>%
  variable_summaries()

technical_dispersion_rate_values <- calculate(beta_rate,values = draws) %>%
  do.call(what = rbind) %>%
  variable_summaries()

# For every row of `replicate_data`, draw 1000 beta-binomial counts using the
# technical overdispersion estimate and convert them to VAFs.
# `apply()` hands each row over as a character vector, so fields are picked
# by position and converted back to numbers where needed.
# NOTE(review): positions 10 and 12 are treated as the mutant count and the
# coverage, 13 as VAF and 14 as Age — confirm against the column order of
# `replicate_data`.
replicate_data_samples <- do.call(
  rbind,
  apply(
    replicate_data,
    MARGIN = 1,
    FUN = function(x) {
      # One pseudo-count is added to both the coverage and the mutant count.
      coverage <- as.numeric(x[12]) + 1
      mutant <- as.numeric(x[10]) + 1
      p_mutant <- mutant / coverage
      technical_beta <- technical_beta_values[1,1]
      simulated_counts <- extraDistr::rbbinom(
        n = 1000,
        size = coverage,
        alpha = (p_mutant * technical_beta) / (1 - p_mutant),
        beta = technical_beta
      )
      # Counts -> VAFs; the first column is renamed after binding.
      simulated <- data.frame(simulated_counts) / coverage
      simulated$SardID <- x[1]
      simulated$Phase <- x[2]
      simulated$Replicate <- x[3]
      simulated$variant <- x[4]
      simulated$Gene <- x[5]
      simulated$Age <- x[14]
      simulated$VAF <- x[13]
      simulated
    }
  )
)
names(replicate_data_samples)[1] <- "Samples"
# Age (column 7) and VAF (column 8) were carried over as text; make them numeric.
replicate_data_samples[,7:8] <- apply(replicate_data_samples[,7:8],2,as.numeric)

# Simulated VAFs (small, transparent, jittered points) against the measured
# VAF (larger points) over age, one panel per gene-variant-individual.
replicate_data_samples %>% 
  ggplot(aes(x = as.numeric(Age),colour = Replicate)) + 
  geom_point(aes(y = Samples),
             alpha = 0.1,
             size = 0.1,
             position = "jitter") + 
  geom_point(aes(y = VAF),
             size = 0.5,
             alpha = 0.9) +
  theme_minimal(base_size = 6) +
  # `scales` spelled out in full rather than relying on partial matching.
  facet_wrap(~ paste(Gene,variant,SardID,sep = '-'),scales = 'free') + 
  ggsci::scale_color_lancet(name = "Replicate") +
  theme(legend.position = 'bottom') + 
  # The x aesthetic is Age, so label it as such.
  xlab("Age") +
  ylab("VAF")
```

## Estimating $\beta$ with technical replicates

Using technical replicates for known DNA quantities ($truQ$) and for actual patient data ($SR$), we can posit the following model:

$$
p_{truQ} = observed\ VAF_{truQ} + 1/coverage_{truQ}
\\
\beta \sim exponential(0.001)
\\
\alpha_{truQ} = \frac{\beta p_{truQ}}{1 - p_{truQ}}
\\
expected\ VAF_{truQ} + 1/coverage_{truQ} \sim Beta(\alpha_{truQ},\beta)
$$

We need to adjust the expected and observed VAFs with the equivalent of having one count due to the fact that beta distributions only permit $\alpha>0$ and $\beta>0$.

$$
p_{SR} = growth\ coefficient = effect_{gene} + individual_{offset}
\\
\alpha_{SR} = \frac{\beta p_{SR}}{1 - p_{SR}}
\\
observed\ counts_{SR} \sim BB(coverage + 1,\alpha_{SR},\beta)
$$

Here, $p_{SR}$ is parametrized as $p_{SR} = ilogit((b_{gene_{i}} + b_{c_{ji}}) * t + u_{ji})$. 

For this subsubsection, the model is specified in `scripts/estimating_overdispersion.R`.

We used this model to infer the $\mu_{\beta}$ and $\sigma_{\beta}$ mentioned in the model specification above.

```{r,fig.height=4,fig.width=4.5}
# Joint model: the TruQ replicates and the cohort replicates share a single
# overdispersion parameter (`beta_variable`).

# TruQ VAFs shifted by the equivalent of one read (1 / coverage) so that the
# beta shape parameters below stay strictly positive.
VAF_expected <- tru_q_data$VAF_expected + 1/tru_q_data$TOTALcount
VAF_observed <- tru_q_data$VAF_observed + 1/tru_q_data$TOTALcount
# Mutant read counts and coverage of the cohort replicates.
Sard_count <- replicate_data$MUTcount 
Sard_cover <- replicate_data$TOTALcount
# A clone is a distinct (SardID, variant) combination.
n_clones <- length(unique(paste(replicate_data$SardID,replicate_data$variant,sep = '-')))

# Per-clone slope (normal prior, mean 0, sd sqrt(0.1^2 + 0.1^2)) and
# per-clone offset (uniform prior on [-50, 0]).
clone_b <- normal(0,sqrt(sum(0.1^2,0.1^2)),dim = c(n_clones,1))
u <- uniform(min=-50,max=0,dim = c(n_clones,1))
# Indicator matrix (rows = measurements, columns = clones) that maps the
# per-clone parameters onto the rows of `replicate_data`.
individual_ind <- one_hot_encode(paste(replicate_data$SardID,replicate_data$variant,sep = '-'))

offset <- individual_ind %*% u
# `%*%` binds tighter than `*`: the per-row slope is multiplied element-wise
# by Age. The trailing comment keeps an alternative that centres Age on its
# minimum.
gene_effect <- individual_ind %*% clone_b * replicate_data$Age#(replicate_data$Age - min(replicate_data$Age))
r <- gene_effect + offset
# ilogit(r) lies in (0, 1), so mu lies in (0, 0.5)
# (presumably because a heterozygous clone cannot exceed 50% VAF — confirm).
mu <- ilogit(r) * 0.5

# Overdispersion parameter: exponentially distributed, with a rate that is
# itself a free variable constrained to be non-negative.
beta_rate <- variable(lower = 0,upper = Inf)
beta_variable <- exponential(
  rate = beta_rate
)

# TruQ likelihood: beta distribution whose mean is the observed VAF.
distribution(VAF_expected) <- beta(
  shape1 = (VAF_observed * beta_variable) / (1 - VAF_observed),
  shape2 = beta_variable
)

# Cohort likelihood: beta-binomial counts whose mean proportion is mu
# (alpha / (alpha + beta) = mu).
distribution(Sard_count) <- beta_binomial(
  size = Sard_cover,
  alpha = (mu * beta_variable) / (1 - mu),
  beta = beta_variable
)

m <- model(beta_variable,beta_rate,clone_b,u)

# HMC with 400-500 leapfrog steps per iteration; 2000 warmup iterations,
# then 1000 retained samples, using 4 cores.
draws <- mcmc(m,
              sampler = hmc(Lmin = 400,Lmax = 500),
              warmup = 2000,
              n_samples = 1000,
              n_cores = 4)

mcmc_trace(draws)

# Posterior draws, one element per chain. `values` is passed by name: newer
# greta releases collect the target arrays through `...`, so a positional
# `draws` would be treated as another target. Each quantity is computed once
# and reused below.
beta_draws <- calculate(beta_variable,values = draws)
mu_draws <- calculate(mu,values = draws)

# Per-chain summaries.
beta_values <- beta_draws %>%
  lapply(
    variable_summaries
  )
mu_values <- mu_draws %>% 
  lapply(
    variable_summaries
  )

# Summaries pooled over chains: posterior mean of mu for every measurement
# and the summary table of the overdispersion parameter.
Values <- list(
  size = Sard_cover,
  mu = colMeans(do.call(rbind,mu_draws)),
  beta = variable_summaries(do.call(rbind,beta_draws))
)
# NOTE(review): entry [1,1] of the summary table is taken as the point
# estimate of beta — confirm against variable_summaries().
beta_parameter <- Values$beta[1,1]
parameter_df <- data.frame(
  cover = Sard_cover,
  alpha = (Values$mu * beta_parameter) / (1 - Values$mu),
  beta = beta_parameter
)
# For every measurement, simulate 1000 beta-binomial counts at the fitted
# parameters and keep the 5%, 50% and 95% quantiles of the simulated VAF.
samples <- parameter_df %>% 
  apply(
    1,function(x){
      S <- extraDistr::rbbinom(n = 1000,size = x[1],alpha = x[2],beta = x[3])
      # Print the offending parameters when the sampler returns NA.
      if (anyNA(S)) {
        print(x)
      }
      S <- S / x[1]
      return(quantile(S,c(0.05,0.50,0.95),na.rm = TRUE))
    }) %>% 
  t %>%
  as.data.frame()
colnames(samples) <- c("Q_005","Q_050","Q_095")

# One row per replicate measurement: simulated VAF quantiles, the measured
# VAF (`true`), the inferred proportion (`prob`) and identifying columns.
prediction_matrix <- data.frame(
  pred = samples$Q_050,
  pred_005 = samples$Q_005,
  pred_095 = samples$Q_095,
  true = (Sard_count)/Values$size,
  prob = Values$mu,
  variant = replicate_data$variant,
  Gene = replicate_data$Gene,
  Age = replicate_data$Age,
  SardID = replicate_data$SardID,
  Replicate = replicate_data$Replicate,
  l = replicate_data$l
) 

# Measured VAF and inferred trajectory over age for each clone, with the
# simulated 5%-95% range drawn as a vertical line; log10 y axis.
trajectory_plot <- prediction_matrix %>% 
  gather(key = 'key',value = 'value',true,prob) %>%
  mutate(key = ifelse(key == 'true','Real data','Inferred trajectory')) %>% 
  ggplot(aes(x = Age,y = value,linetype = key)) + 
  geom_point(size = 0.5) +
  geom_linerange(aes(ymin = pred_005,ymax = pred_095),size = 0.25) +
  geom_line(aes(group = paste(Replicate,key)),size = 0.25) +
  theme_minimal(base_size = 6) +
  scale_linetype_manual(values = c(1,3),name = NULL) +
  facet_wrap(~ reorder(l,SardID),scales = 'free') + 
  ylab("VAF") +
  scale_y_continuous(labels = function(x) sprintf('%s%%',x*100), trans = 'log10') +
  theme(legend.position = "bottom",legend.key.height = unit(0.1,"cm"))

# Save explicitly: `ggsave()` is not a plot component, and adding its return
# value to a plot with `+` fails on current ggplot2 releases.
ggsave("figures/overdispersion/example-trajectories.pdf",
       plot = trajectory_plot,height = 3.5,width = 3.5)

# Render the figure in the knitted notebook.
trajectory_plot

# Persist the summary of the overdispersion parameter. The output directory
# is created first so the write cannot fail on a fresh checkout after the
# MCMC run has already completed.
dir.create("models",showWarnings = FALSE)
saveRDS(Values$beta,file = "models/overdispersion.RDS")
```

```{r,fig.height=1.5,fig.width=2.5}
# Trace plot of the overdispersion parameter only (parameters are selected
# by a regular expression on their names).
beta_chain_plot <- mcmc_trace(draws,regex_pars = 'beta_variable') + 
  theme_gerstung(base_size = 6) + 
  ylab("Technical overdispersion (beta)") + 
  xlab("Sample index") + 
  theme(legend.key.height = unit(0.1,"cm"))

# Save explicitly: `ggsave()` is not a plot component, and adding its return
# value to a plot with `+` fails on current ggplot2 releases.
ggsave("figures/overdispersion/beta-chain.pdf",
       plot = beta_chain_plot,height = 1.5,width = 2.5)

# Render the figure in the knitted notebook.
beta_chain_plot
```