
New check #9

Merged
merged 21 commits into from Jul 21, 2023
Changes from all commits
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -14,7 +14,8 @@ Depends:
Imports:
graphics,
stats,
survival,
MASS
Suggests:
knitr,
testthat (>= 2.0)
4 changes: 3 additions & 1 deletion NAMESPACE
@@ -2,13 +2,15 @@

export(bucher)
export(center_ipd)
export(check_weights)
export(dummize_ipd)
export(estimate_weights)
export(generate_survival_data)
export(log_cum_haz_plot)
export(maic_tte_unanchor)
export(medSurv_makeup)
export(plot_weights)
export(process_agd)
export(process_ipd)
export(resid_plot)
export(survfit_makeup)
import(stats)
74 changes: 74 additions & 0 deletions R/data_generation.R
@@ -0,0 +1,74 @@
#' Generate time-to-event data
#'
#' This function generates time-to-event data, following the data-generating mechanism of
#' Bender et al. and Remiro-Azocar et al. 2021: survival times are simulated from a Weibull distribution
#' under a proportional hazards parameterization. As per Remiro-Azocar et al. 2021, the default
#' scale parameter (lambda) is 8.5 and the default shape parameter (nu) is 1.3, reflecting frequently
#' observed mortality trends in metastatic cancer patients. Censoring times are drawn from an
#' exponential distribution whose rate parameter (0.96 by default) is selected to achieve a censoring rate
#' of 35% under the active treatment at baseline (with the values of the covariates set to zero).
#'
#' @param N sample size
#' @param ncov number of covariates to generate
#' @param lambda scale parameter in baseline hazard function
#' @param nu shape parameter in baseline hazard function
#' @param censor_rate rate parameter of the exponential distribution of censoring times
#' @param cor correlation between covariates
#' @param marginal_mean marginal mean of covariates
#' @param marginal_sd marginal standard deviation of covariates
#' @param beta vector of coefficients for prognostic factors. Default is `-log(0.67)` for all covariates.
#' @param gamma vector of coefficients for effect modifiers. Default is `0` for half of the covariates and
#' `-log(0.67)` for the others.
#' @param d0 coefficient for the conditional treatment effect when all covariates are equal to zero
#' @param seed seed used for simulation reproducibility
#'
#' @return data.frame with survival time, censor (event) status, treatment indicator, and covariates
#'
#' @references Remiro-Azocar et al. 2021
#'
#' @export

generate_survival_data <- function(N = 150, ncov = 4, lambda = 8.5, nu = 1.3, censor_rate = 0.96,
cor = 0.35, marginal_mean = 0.6, marginal_sd = 0.2,
beta = rep(-log(0.67), ncov),
gamma = sort(rep(c(0, -log(0.67)), length.out = ncov)),
d0 = log(0.25), seed = 1) {
set.seed(seed)

# if beta or gamma is passed as NULL, fall back to the default values
if (is.null(beta)) {
beta <- rep(-log(0.67), ncov)
}

if (is.null(gamma)) {
gamma <- rep(-log(0.67), ncov)
gamma[1:round(ncov / 2)] <- 0
}

# generate ncov covariates (X) using a multivariate normal
sigma <- matrix(cor, nrow = ncov, ncol = ncov)
diag(sigma) <- 1
sigma <- sigma * marginal_sd^2
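# the result is an equicorrelated covariance matrix: common correlation 'cor',
# with each covariate having variance marginal_sd^2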

covariates <- MASS::mvrnorm(N, mu = rep(marginal_mean, ncov), Sigma = sigma)

treat <- sample(x = c(0, 1), size = N, replace = TRUE, prob = c(0.5, 0.5))

# linear predictor
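# beta: prognostic main effects; gamma: effect-modifier-by-treatment interactions;
# d0: conditional treatment effect at covariates = 0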
lp <- covariates %*% beta + covariates %*% gamma * treat + d0 * treat

# Weibull latent event times
u <- runif(n = N)
latent_time <- (-log(u) / (lambda * exp(lp)))^(1 / nu)

# exponential censoring times
censor_time <- rexp(n = N, rate = censor_rate)

# follow-up times and event indicators
time <- pmin(latent_time, censor_time)
event <- as.numeric(latent_time <= censor_time)

colnames(covariates) <- paste0("x", seq_len(ncov))
survival_data <- data.frame(USUBJID = seq_len(N), time = time, event = event, treat = treat, covariates)
return(survival_data)
}
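A quick usage sketch (not part of the PR): simulate a data set under the defaults, check the realized censoring rate, and draw Kaplan-Meier curves by arm with the survival package (already in Imports). Note the 35% censoring target applies at covariate values of zero, so the realized rate under the default marginal mean of 0.6 will deviate somewhat.

library(survival)

dat <- generate_survival_data(N = 500, ncov = 4, seed = 42)
head(dat)

# realized censoring rate; the default rate of 0.96 targets ~35% at covariates = 0
mean(dat$event == 0)

# Kaplan-Meier curves by treatment arm
km <- survfit(Surv(time, event) ~ treat, data = dat)
plot(km, col = c(1, 2), xlab = "Time", ylab = "Survival probability")
legend("topright", legend = c("treat = 0", "treat = 1"), col = c(1, 2), lty = 1)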
115 changes: 81 additions & 34 deletions R/matching.R
@@ -4,55 +4,60 @@

#' Derive individual weights in the matching step of MAIC
#'
#' Assuming the data has been properly processed, this function takes individual patient data (IPD) with centered
#' covariates (effect modifiers and/or prognostic variables) as input, and generates a weight for each individual
#' in the IPD trial, such that the weighted covariate summaries match the chosen statistics of those covariates
#' in the aggregated data (AgD) trial.
#'
#' @param data a data.frame of IPD containing the centered covariates; rows with missing values in any of the
#' centered covariates are excluded from the optimization and receive NA weights
#' @param centered_colnames a character or numeric vector (column indicators) of centered covariates; the default
#' NULL means all columns in \code{data} are treated as centered covariates
#' @param start_val a scalar, the starting value for all coefficients of the propensity score regression
#' @param method a string, name of the optimization algorithm (see 'method' argument of \code{base::optim()}).
#' The default is `"BFGS"`, other options are `"Nelder-Mead"`, `"CG"`, `"L-BFGS-B"`, `"SANN"`, and `"Brent"`
#' @param ... all other arguments from \code{base::optim()}
#'
#' @return a list with the following 5 elements,
#' \describe{
#' \item{data}{a data.frame, the input \code{data} with appended columns 'weights' and 'scaled_weights'.
#' Scaled weights sum to the number of rows in \code{data} that have no missing values in any of the
#' centered covariates}
#' \item{centered_colnames}{column names of the centered covariates in \code{data}}
#' \item{nr_missing}{number of rows in \code{data} with at least one missing value among the specified
#' centered covariates}
#' \item{ess}{effective sample size, computed as \eqn{(\sum_i w_i)^2 / \sum_i w_i^2}}
#' \item{opt}{object returned by \code{base::optim()}, for assessing convergence and other details}
#' }
#' @export
#'
#' @examples
estimate_weights <- function(data, centered_colnames = NULL, start_val = 0, method = "BFGS", ...) {
# pre check
ch1 <- is.data.frame(data)
if (!ch1) stop("'data' is not a data.frame")

ch2 <- (!is.null(centered_colnames))
if (ch2 && is.numeric(centered_colnames)) {
ch2b <- any(centered_colnames < 1 | centered_colnames > ncol(data))
if (ch2b) stop("specified centered_colnames are out of bound")
} else if (ch2 && is.character(centered_colnames)) {
ch2b <- !all(centered_colnames %in% names(data))
if (ch2b) stop("1 or more specified centered_colnames are not found in 'data'")
} else if (ch2) {
stop("'centered_colnames' should be either a numeric or character vector")
}

ch3 <- vapply(seq_along(centered_colnames), function(ii) {
!is.numeric(data[, centered_colnames[ii]])
}, logical(1))
if (any(ch3)) {
stop(paste0("following columns of 'data' are not numeric for the calculation:", paste(which(ch3), collapse = ",")))
}

# prepare data for optimization
if (is.null(centered_colnames)) centered_colnames <- seq_len(ncol(data))
EM <- data[, centered_colnames, drop = FALSE]
ind <- apply(EM, 1, function(xx) any(is.na(xx)))
nr_missing <- sum(ind)
EM <- as.matrix(EM[!ind, , drop = FALSE])

# objective and gradient functions
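# (bodies elided in this diff view) In the standard MAIC formulation
# (Signorovitch et al. 2010), the objective is Q(alpha) = sum(exp(X %*% alpha))
# with gradient t(X) %*% exp(X %*% alpha); minimizing Q yields weights
# w = exp(X %*% alpha_hat) that balance the centered covariates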
objfn <- function(alpha, X) {
@@ -64,7 +69,7 @@ estimate_weights <- function(data, centered_colnames = NULL, startVal = 0, metho

# estimate weights
opt1 <- optim(
par = rep(start_val, ncol(EM)),
fn = objfn, gr = gradfn,
X = EM,
method = method,
@@ -81,7 +86,7 @@ estimate_weights <- function(data, centered_colnames = NULL, startVal = 0, metho
data$scaled_weights <- NA
data$scaled_weights[!ind] <- wt_rs

if (is.numeric(centered_colnames)) centered_colnames <- names(data)[centered_colnames]

# Output
list(
@@ -96,19 +101,23 @@ estimate_weights <- function(data, centered_colnames = NULL, startVal = 0, metho
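A hedged end-to-end sketch of the matching step (not part of the PR). The centered column names are hypothetical, and the AgD mean of 0.6 is hard-coded in place of a real aggregate summary (in the package workflow this centering would come from process_agd()/center_ipd()):

# simulate IPD and center two covariates against an assumed AgD mean of 0.6
sim_ipd <- generate_survival_data(N = 200, ncov = 2, seed = 7)
sim_ipd$x1_centered <- sim_ipd$x1 - 0.6
sim_ipd$x2_centered <- sim_ipd$x2 - 0.6

res <- estimate_weights(sim_ipd, centered_colnames = c("x1_centered", "x2_centered"))
res$ess             # effective sample size, (sum w)^2 / sum(w^2)
res$opt$convergence # 0 indicates optim() converged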

#' Plot MAIC weights in a histogram with key statistics in legend
#'
#' Generates a plot given the individuals weights with key summary in top right legend that includes
#' median weight, effective sample size (ESS), and reduction percentage (what percent ESS is reduced from the
#' original sample size).
#' Two types of weights are provided by \code{\link{estimate_weights}}: unscaled and scaled.
#' Scaled weights are relative to the original unit weights of each individual.
#' In other words, a scaled weight greater than 1 means that an individual carries more weight in the
#' re-weighted population than the original data and a scaled weight less than 1 means that an individual carries
#' less weight in the re-weighted population than the original data.
#'
#' @param wt a numeric vector of individual MAIC weights (derived using \code{\link{estimate_weights}})
#' @param main_title a character string, main title of the plot
#'
#' @return a plot of unscaled or scaled weights
#' @importFrom graphics hist
#' @export
plot_weights <- function(wt, main_title = "Unscaled Individual Weights") {
# calculate sample size and exclude NA from wt
nr_na <- sum(is.na(wt))
n <- length(wt) - nr_na
@@ -126,7 +135,7 @@ plot_weights <- function(wt, main_title = "Unscaled Individual Weigths") {
)
plot_lty <- c(2, NA, NA)

if (nr_na > 0) {
plot_legend <- c(plot_legend, paste0("#Missing Weights = ", nr_na))
plot_lty <- c(plot_lty, NA)
}
@@ -139,3 +148,41 @@ plot_weights <- function(wt, main_title = "Unscaled Individual Weigths") {
legend("topright", bty = "n", lty = plot_lty, cex = 0.8, legend = plot_legend)
}
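Continuing the sketch above, both weight columns appended by estimate_weights() can be passed to the plotting helper:

plot_weights(res$data$weights, main_title = "Unscaled Individual Weights")
plot_weights(res$data$scaled_weights, main_title = "Scaled Individual Weights")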

#' Check to see if weights are optimized correctly
#'
#' This function checks whether the optimization was carried out properly, by comparing the covariate averages
#' of the IPD before and after weighting.
#'
#' @param optimized object returned after calculating weights using \code{\link{estimate_weights}}
#' @param match_cov covariates that should be checked to see whether the weighted IPD average matches the
#' aggregate data average. This could be the same set of variables that were used for matching, or it can
#' include variables that were not used for matching (e.g. stratification variables)
#' @param digits number of digits for rounding summary table
#'
#' @return a matrix of weighted and unweighted covariate averages of the IPD, with the sample size (unweighted
#' row) and effective sample size (weighted row) in the first column
#' @export
check_weights <- function(optimized, match_cov, digits = 2) {
if (missing(match_cov)) stop("match_cov is missing. Covariates to check must be defined.")
ipd_with_weights <- optimized$data

arm_names <- c("Unweighted IPD", "Weighted IPD")
ess <- c(nrow(ipd_with_weights), optimized$ess)

unweighted_cov <- sapply(ipd_with_weights[, match_cov, drop = FALSE], mean)

weighted_cov <- sapply(
ipd_with_weights[, match_cov, drop = FALSE],
weighted.mean,
w = ipd_with_weights$weights
)

cov_combined <- rbind(unweighted_cov, weighted_cov)

baseline_summary <- cbind(ess, cov_combined)
rownames(baseline_summary) <- arm_names

round(baseline_summary, digits = digits)
}
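Finally, a sketch of the new check itself, reusing the hypothetical simulation above; the weighted IPD averages should land near the value used for centering (0.6), while the first column reports N for the unweighted row and the ESS for the weighted row:

check_weights(res, match_cov = c("x1", "x2"))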