Merge pull request #60 from ScottClaessens/48-function-to-predict-an-…

…evolutionary-time-series-given-some-initial-values 48 function to predict an evolutionary time series given some initial values
ScottClaessens · Oct 9, 2024 · d8766d4 · d8766d4
2 parents 15197c7 + 2939e5a
commit d8766d4
Show file tree

Hide file tree

Showing 8 changed files with 691 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -29,8 +29,10 @@ Imports:
     bayesplot,
     cmdstanr,
     colorspace,
+    cubelyr,
     dplyr,
     ggplot2,
+    Matrix,
     methods,
     phangorn,
     phaseR,

diff --git a/NAMESPACE b/NAMESPACE
@@ -13,8 +13,10 @@ export(coev_make_stancode)
 export(coev_make_standata)
 export(coev_plot_delta_theta)
 export(coev_plot_flowfield)
+export(coev_plot_pred_series)
 export(coev_plot_predictive_check)
 export(coev_plot_selection_gradient)
+export(coev_pred_series)
 export(coev_simulate_coevolution)
 export(extract_samples)
 export(stancode)

diff --git a/R/coev_plot_pred_series.R b/R/coev_plot_pred_series.R
@@ -0,0 +1,172 @@
+#' Plot a predicted coevolutionary time series from a fitted \code{coevfit}
+#' object
+#'
+#' This function plots a predicted coevolutionary time series using the
+#' estimated parameters from a fitted \code{coevfit} model. By default, the
+#' plot uses the posterior ancestral states estimated by the model as the
+#' starting values, but users can also set their own starting values for
+#' traits. Plots can be generated with or without stochastic drift. For more
+#' details on the underlying predictive function, see
+#' \code{help(coev_pred_series)}.
+#'
+#' @param object An object of class \code{coevfit}
+#' @param prob A value between 0 and 1 indicating the desired probability
+#'   to be covered by the uncertainty intervals. The default is 0.95.
+#' @param ... Additional arguments passed to \code{\link{coev_pred_series}}
+#'
+#' @return A ggplot object.
+#'
+#' @author Scott Claessens \email{scott.claessens@@gmail.com}, Erik Ringen
+#'   \email{erikjacob.ringen@@uzh.ch}
+#'
+#' @seealso \code{\link{coev_pred_series}}
+#'
+#' @examples
+#' \dontrun{
+#' # fit dynamic coevolutionary model
+#' fit <- coev_fit(
+#'   data = authority$data,
+#'   variables = list(
+#'     political_authority = "ordered_logistic",
+#'     religious_authority = "ordered_logistic"
+#'   ),
+#'   id = "language",
+#'   tree = authority$phylogeny,
+#'   # additional arguments for cmdstanr::sample()
+#'   chains = 4,
+#'   parallel_chains = 4,
+#'   seed = 1
+#'   )
+#'
+#' # simulated trait co-evolution
+#' coev_plot_pred_series(
+#'   object = fit,
+#'   stochastic = TRUE
+#'   )
+#'
+#' # expected trait co-evolution, no drift
+#' coev_plot_pred_series(
+#'   object = fit,
+#'   stochastic = FALSE
+#'   )
+#' }
+#'
+#' @export
+coev_plot_pred_series <- function(object, prob = 0.95, ...) {
+  # stop if object not of class coevfit
+  if (!methods::is(object, "coevfit")) {
+    stop2(
+      paste0(
+        "Argument 'object' must be a fitted coevolutionary model ",
+        "of class coevfit."
+        )
+      )
+  }
+  # stop if prob is outside of range 0 - 1
+  if (prob <= 0 | prob >= 1) {
+    stop2("Argument 'prob' is not between 0 and 1.")
+  }
+  # user-supplied arguments
+  user_args <- list(...)
+  # default arguments from coev_pred_series
+  default_args <- as.list(formals(coev_pred_series))
+  combined_args <- utils::modifyList(default_args, user_args)
+  combined_args$object <- object
+  preds <- do.call(coev_pred_series, combined_args)
+  # create plot
+  if (combined_args$stochastic == FALSE) {
+      # get CI quantiles
+      probs <- c(((1 - prob) / 2), 1 - ((1 - prob) / 2))
+      # get expected values in cube format
+      epreds_long <-
+        preds |>
+        cubelyr::as.tbl_cube(
+          met_name = "est"
+          ) |>
+        as.data.frame()
+      # summarise expected values
+      epreds_summary <-
+        epreds_long |>
+        dplyr::group_by(.data$response, .data$time) |>
+        dplyr::summarise(
+          mean = mean(.data$est),
+          lower_CI = stats::quantile(.data$est, probs[1]),
+          upper_CI = stats::quantile(.data$est, probs[2])
+      )
+      # plot
+      p <-
+        ggplot2::ggplot(
+          epreds_summary,
+          ggplot2::aes(
+            x = .data$time,
+            y = .data$mean,
+            fill = .data$response,
+            color = .data$response,
+            linetype = .data$response
+            )
+          ) +
+      ggplot2::geom_line(lwd = 1) +
+      ggplot2::geom_ribbon(
+        ggplot2::aes(
+          ymin = .data$lower_CI,
+          ymax = .data$upper_CI
+          ),
+        alpha = 0.25,
+        lwd = 0.25
+        ) +
+      ggplot2::theme_classic(base_size = 14) +
+      ggplot2::scale_x_continuous(
+        breaks = c(0, max(epreds_summary$time)),
+        labels = c("LCA", "present")
+        ) +
+      ggplot2::ylab("trait value (latent scale)") +
+      ggplot2::xlab("time") +
+      ggplot2::theme(
+        legend.title = ggplot2::element_blank(),
+        strip.background = ggplot2::element_blank(),
+        strip.text = ggplot2::element_blank()
+        ) +
+      ggplot2::ggtitle("Expected trait coevolution")
+  } else if (combined_args$stochastic == TRUE) {
+    # get predictions
+    sims_long <-
+      preds |>
+      cubelyr::as.tbl_cube(met_name = "est") |>
+      as.data.frame() |>
+      dplyr::mutate(
+        sim = factor(
+          paste("sim", .data$samps),
+          levels = paste("sim", sort(unique(.data$samps)))
+          )
+        )
+    # plot
+    p <-
+      sims_long |>
+      dplyr::filter(.data$samps <= 15) |>
+      ggplot2::ggplot(
+        ggplot2::aes(
+          x = .data$time,
+          y = .data$est,
+          color = .data$response
+          )
+        ) +
+    ggplot2::facet_wrap( ~ .data$sim, ncol = 5) +
+    ggplot2::geom_line(lwd = 1) +
+    ggplot2::theme_minimal(base_size = 14) +
+    ggplot2::scale_x_continuous(
+      breaks = c(0, max(sims_long$time)),
+      labels = c("LCA", "present")
+      ) +
+    ggplot2::ylab("trait value (latent scale)") +
+    ggplot2::xlab("time") +
+    ggplot2::theme(
+      legend.title = ggplot2::element_blank(),
+      strip.background = ggplot2::element_blank(),
+      panel.spacing.x = ggplot2::unit(1, "lines"),
+      axis.text.x = ggplot2::element_blank(),
+      axis.ticks.x = ggplot2::element_blank()
+      ) +
+    ggplot2::ggtitle("Predicted trait coevolution")
+  }
+  return(p)
+}
diff --git a/R/coev_pred_series.R b/R/coev_pred_series.R
@@ -0,0 +1,186 @@
+#' Predict a co-evolutionary time series from a fitted \code{coevfit} object
+#'
+#' This function produces predicted values of traits over a time series, given
+#' the estimated parameters for a \code{coevfit} model. This function is used
+#' under the hood by the plotting function \code{\link{coev_pred_series}}.
+#'
+#' @param object An object of class \code{coevfit}
+#' @param eta_anc If \code{NULL} (default), the starting values for the latent
+#'   states \eqn{\eta} will be set to the estimated ancestral states at the
+#'   root from the \code{coevfit} model. Otherwise, a named numeric vector with
+#'   length equal to the number of variables specifying the initial \eqn{\eta}
+#'   values. All variable names must be included in this vector.
+#' @param tmax A positive number indicating the total duration of time for which
+#'   to predict. Set to 1 by default, corresponding to the entire time depth of
+#'   the original phylogenetic tree(s).
+#' @param ntimes A positive integer indicating the total number of discrete time
+#'   steps over which to make predictions. Each step corresponds to a time
+#'   difference of dt = tmax/ntimes. Set to 30 by default.
+#' @param ndraws An integer indicating the number of draws to return. The
+#'   default and maximum number of draws is the size of the posterior sample.
+#' @param stochastic Logical (defaults to \code{FALSE}); indicator of whether
+#'   predictions should include only the expected co-evolutionary change
+#'   due to deterministic selection (FALSE) or also stochastic drift (TRUE).
+#'
+#' @return An \[ndraw, ntimes, nvariables\] array of predicted \eqn{\eta}
+#'   values.
+#'
+#' @author Scott Claessens \email{scott.claessens@@gmail.com}, Erik Ringen
+#'   \email{erikjacob.ringen@@uzh.ch}
+#'
+#' @seealso \code{\link{coev_plot_pred_series}}
+#'
+#' @examples
+#' \dontrun{
+#' # fit dynamic coevolutionary model
+#' fit <- coev_fit(
+#'   data = authority$data,
+#'   variables = list(
+#'     political_authority = "ordered_logistic",
+#'     religious_authority = "ordered_logistic"
+#'   ),
+#'   id = "language",
+#'   tree = authority$phylogeny,
+#'   # additional arguments for cmdstanr::sample()
+#'   chains = 4,
+#'   parallel_chains = 4,
+#'   seed = 1
+#'   )
+#'
+#' # simulate trait co-evolution
+#' sims <- coev_pred_series(
+#'   object = fit,
+#'   stochastic = TRUE
+#'   )
+#'
+#' # expected trait co-evolution, no drift
+#' epreds <- coev_pred_series(
+#'   object = fit,
+#'   stochastic = FALSE
+#'   )
+#' }
+#'
+#' @export
+coev_pred_series <- function(object, eta_anc = NULL, tmax = 1, ntimes = 30,
+                             ndraws = NULL, stochastic = FALSE) {
+  # stop if object is not of class coevfit
+  if (!methods::is(object, "coevfit")) {
+    stop2(
+      paste0(
+        "Argument 'object' must be a fitted coevolutionary model of class ",
+        "coevfit."
+        )
+      )
+  }
+  if (!is.null(eta_anc)) {
+    # stop if eta_anc is neither NULL nor
+    # a named vector of length equal to the number of variables
+    if (!is.numeric(eta_anc) | length(eta_anc) != length(object$variables)) {
+        stop2(
+          paste0(
+            "Argument 'eta_anc' must be numeric and equal in length to the ",
+            "number of variables."
+          )
+        )
+    }
+    # stop if eta_anc is not a named vector
+    if (is.null(names(eta_anc))) {
+      stop2("Argument 'eta_anc' is not a named vector.")
+    }
+    # stop if eta_anc does not have names equal to variables in the model
+    if (!identical(sort(names(eta_anc)), sort(names(object$variables)))) {
+      stop2(
+        paste0(
+          "Argument 'eta_anc' has names different to the variables included ",
+          "in the model."
+        )
+      )
+    }
+  }
+  # stop if tmax is not numeric or not positive
+  if (!is.numeric(tmax) | length(tmax) != 1) {
+    stop2("Argument 'tmax' must be a single numeric value.")
+  } else if (tmax <= 0) {
+    stop2("Argument 'tmax' must be positive.")
+  }
+  # stop if ndraws is not a single integer between 1 and the total num draws
+  if (!is.null(ndraws)) {
+    if (!is.integer(ndraws) | length(ndraws) != 1) {
+      stop2("Argument 'ndraws' must be a single integer.")
+    } else if (ndraws < 1 | ndraws > nrow(object$fit$draws())) {
+      stop2(
+        "Argument 'ndraws' must be between 1 and the total number of draws."
+        )
+    }
+  }
+  # stop if stochastic not logical
+  if (!is.logical(stochastic)) {
+    stop2("Argument 'stochastic' must be logical.")
+  }
+  # get posterior samples and number of variables
+  post <- extract_samples(object)
+  J <- length(object$variables)
+  # get eta_anc in the correct order
+  eta_anc <- eta_anc[names(object$variables)]
+  # get number of samples
+  nsamps <- ifelse(is.null(ndraws), length(post$lp__), ndraws)
+  # initialise empty array for predictions
+  preds <-
+    array(
+      NA,
+      dim = c(nsamps, ntimes + 1, J),
+      dimnames = list(
+        samps = 1:nsamps,
+        time = 1:(ntimes + 1),
+        response = names(object$variables)
+        )
+      )
+  # if user defined ancestral states
+  # otherwise, use model inferred ancestral states
+  if (!is.null(eta_anc)) {
+    for (i in 1:nsamps) preds[i, 1, ] = eta_anc
+  } else {
+    # set posterior ancestral states
+    eta_anc_long <- post$eta_anc
+    ntrees <- dim(eta_anc_long)[2]
+    # shuffle dimensions of tree dimension to get random draws
+    eta_anc_long <-
+      eta_anc_long[, sample(1:ntrees, size = ntrees, replace = FALSE), ]
+    if (ntrees > 1) {
+      # make into 2d matrix by stacking trees
+      eta_anc_long2 <- eta_anc_long[, 1, ]
+      for (t in 2:ntrees) {
+        eta_anc_long2 <- rbind(eta_anc_long2, eta_anc_long[,t,])
+        eta_anc_long <- eta_anc_long2
+      }
+    }
+    for (i in 1:nsamps) {
+      preds[i, 1, ] = eta_anc_long[i, ]
+    }
+  }
+  # calculate predictions
+  for (i in 1:nsamps) {
+    # selection parameters
+    A <- post$A[i,,]
+    A_delta <- as.matrix(Matrix::expm(A * tmax / ntimes))
+    inv_A <- solve(A)
+    I <- diag(rep(1, J))
+    b <- post$b[i,]
+    # drift parameters
+    Q_inf <- post$Q_inf[i,,]
+    VCV <- Q_inf - ((A_delta) %*% Q_inf %*% t(A_delta))
+    chol_VCV <- t(chol(VCV))
+    # calculate predictions over time
+    for (t in 1:ntimes) {
+      preds[i, t + 1, ] <- (A_delta %*% preds[i, t, ] +
+                              (inv_A %*% (A_delta - I) %*% b))[,1]
+      # add drift
+      if (stochastic == TRUE) {
+        preds[i, t + 1, ] <-
+          preds[i, t + 1, ] + (chol_VCV %*% stats::rnorm(J, 0, 1))
+      }
+    }
+  }
+  # shuffle draws to de-correlate when doing simulations
+  preds[sample(1:nsamps, size = nsamps, replace = FALSE), , ]
+}