
New check #9

Merged
merged 21 commits into from Jul 21, 2023
Changes from all commits
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -14,7 +14,8 @@ Depends:
Imports:
graphics,
stats,
survival,
MASS
Suggests:
knitr,
testthat (>= 2.0)
4 changes: 3 additions & 1 deletion NAMESPACE
@@ -2,13 +2,15 @@

export(bucher)
export(center_ipd)
export(check_weights)
export(dummize_ipd)
export(estimate_weights)
export(generate_survival_data)
export(log_cum_haz_plot)
export(maic_tte_unanchor)
export(medSurv_makeup)
export(plot_weights)
export(process_agd)
export(process_ipd)
export(resid_plot)
export(survfit_makeup)
import(stats)
74 changes: 74 additions & 0 deletions R/data_generation.R
@@ -0,0 +1,74 @@
#' Generate time-to-event data
#'
#' This function generates time-to-event data, following the data-generating mechanism of
#' Bender et al. and Remiro-Azocar et al. 2021: survival times are simulated from a Weibull distribution
#' under a proportional hazards parameterization. As per Remiro-Azocar et al. 2021, the default
#' scale parameter (lambda) is 8.5 and the default shape parameter (nu) is 1.3, reflecting frequently
#' observed mortality trends in metastatic cancer patients. Censoring times are drawn from an
#' exponential distribution whose rate parameter (0.96 by default) is selected to achieve a censoring rate
#' of 35% under the active treatment at baseline (with the values of the covariates set to zero).
#'
#' @param N sample size
#' @param ncov number of covariates to generate
#' @param lambda scale parameter in baseline hazard function
#' @param nu shape parameter in baseline hazard function
#' @param censor_rate rate parameter of the exponential distribution of censoring times
#' @param cor correlation between covariates
#' @param marginal_mean marginal mean of covariates
#' @param marginal_sd marginal standard deviation of covariates
#' @param beta vector of coefficients for prognostic factors. Default is `-log(0.67)` for all covariates.
#' @param gamma vector of coefficients for effect modifiers. Default is `0` for half of the covariates and
#' `-log(0.67)` for the others.
#' @param d0 coefficient for the conditional treatment effect when all covariates are equal to zero
#' @param seed seed used for simulation reproducibility
#'
#' @return data.frame with survival time, censor (event) status, treatment indicator, and covariates
#'
#' @references Remiro-Azocar et al. 2021
#'
#' @export

generate_survival_data <- function(N = 150, ncov = 4, lambda = 8.5, nu = 1.3, censor_rate = 0.96,
cor = 0.35, marginal_mean = 0.6, marginal_sd = 0.2,
beta = rep(-log(0.67), ncov),
gamma = sort(rep(c(0, -log(0.67)), length.out = ncov)),
d0 = log(0.25), seed = 1) {
set.seed(seed)

# if beta or gamma is passed as NULL, fall back to the default values
if (is.null(beta)) {
beta <- rep(-log(0.67), ncov)
}

if (is.null(gamma)) {
gamma <- rep(-log(0.67), ncov)
gamma[1:round(ncov / 2)] <- 0
}

# generate ncov covariates (X) using a multivariate normal
sigma <- matrix(cor, nrow = ncov, ncol = ncov)
diag(sigma) <- 1
sigma <- sigma * marginal_sd^2
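# the result is an equicorrelated covariance matrix: common correlation 'cor',
# with each covariate having variance marginal_sd^2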

covariates <- MASS::mvrnorm(N, mu = rep(marginal_mean, ncov), Sigma = sigma)

treat <- sample(x = c(0, 1), size = N, replace = TRUE, prob = c(0.5, 0.5))

# linear predictor
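# beta: prognostic main effects; gamma: effect-modifier-by-treatment interactions;
# d0: conditional treatment effect at covariates = 0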
lp <- covariates %*% beta + covariates %*% gamma * treat + d0 * treat

# Weibull latent event times
u <- runif(n = N)
latent_time <- (-log(u) / (lambda * exp(lp)))^(1 / nu)

# exponential censoring times
censor_time <- rexp(n = N, rate = censor_rate)

# follow-up times and event indicators
time <- pmin(latent_time, censor_time)
event <- as.numeric(latent_time <= censor_time)

colnames(covariates) <- paste0("x", seq_len(ncov))
survival_data <- data.frame(USUBJID = seq_len(N), time = time, event = event, treat = treat, covariates)
return(survival_data)
}
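A quick usage sketch (not part of the PR): simulate a data set under the defaults, check the realized censoring rate, and draw Kaplan-Meier curves by arm with the survival package (already in Imports). Note the 35% censoring target applies at covariate values of zero, so the realized rate under the default marginal mean of 0.6 will deviate somewhat.

library(survival)

dat <- generate_survival_data(N = 500, ncov = 4, seed = 42)
head(dat)

# realized censoring rate; the default rate of 0.96 targets ~35% at covariates = 0
mean(dat$event == 0)

# Kaplan-Meier curves by treatment arm
km <- survfit(Surv(time, event) ~ treat, data = dat)
plot(km, col = c(1, 2), xlab = "Time", ylab = "Survival probability")
legend("topright", legend = c("treat = 0", "treat = 1"), col = c(1, 2), lty = 1)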
115 changes: 81 additions & 34 deletions R/matching.R
@@ -4,55 +4,60 @@

#' Derive individual weights in the matching step of MAIC
#'
#' Assuming the data has been properly processed, this function takes individual patient data (IPD) with centered
#' covariates (effect modifiers and/or prognostic variables) as input, and generates a weight for each individual
#' in the IPD trial, such that the weighted covariate summaries match the chosen statistics of those covariates
#' in the aggregated data (AgD) trial.
#'
#' @param data a data.frame of IPD containing the centered covariates; rows with missing values in any of the
#' centered covariates are excluded from the optimization and receive NA weights
#' @param centered_colnames a character or numeric vector (column indicators) of centered covariates; the default
#' NULL means all columns in \code{data} are treated as centered covariates
#' @param start_val a scalar, the starting value for all coefficients of the propensity score regression
#' @param method a string, name of the optimization algorithm (see 'method' argument of \code{base::optim()}).
#' The default is `"BFGS"`, other options are `"Nelder-Mead"`, `"CG"`, `"L-BFGS-B"`, `"SANN"`, and `"Brent"`
#' @param ... all other arguments from \code{base::optim()}
#'
#' @return a list with the following 5 elements,
#' \describe{
#' \item{data}{a data.frame, the input \code{data} with appended columns 'weights' and 'scaled_weights'.
#' Scaled weights sum to the number of rows in \code{data} that have no missing values in any of the
#' centered covariates}
#' \item{centered_colnames}{column names of the centered covariates in \code{data}}
#' \item{nr_missing}{number of rows in \code{data} with at least one missing value among the specified
#' centered covariates}
#' \item{ess}{effective sample size, computed as \eqn{(\sum_i w_i)^2 / \sum_i w_i^2}}
#' \item{opt}{object returned by \code{base::optim()}, for assessing convergence and other details}
#' }
#' @export
#'
#' @examples
estimate_weights <- function(data, centered_colnames = NULL, start_val = 0, method = "BFGS", ...) {
# pre check
ch1 <- is.data.frame(data)
if (!ch1) stop("'data' is not a data.frame")

ch2 <- (!is.null(centered_colnames))
if (ch2 && is.numeric(centered_colnames)) {
ch2b <- any(centered_colnames < 1 | centered_colnames > ncol(data))
if (ch2b) stop("specified centered_colnames are out of bound")
} else if (ch2 && is.character(centered_colnames)) {
ch2b <- !all(centered_colnames %in% names(data))
if (ch2b) stop("1 or more specified centered_colnames are not found in 'data'")
} else if (ch2) {
stop("'centered_colnames' should be either a numeric or character vector")
}

ch3 <- vapply(seq_along(centered_colnames), function(ii) {
!is.numeric(data[, centered_colnames[ii]])
}, logical(1))
if (any(ch3)) {
stop(paste0("following columns of 'data' are not numeric for the calculation:", paste(which(ch3), collapse = ",")))
}

# prepare data for optimization
if (is.null(centered_colnames)) centered_colnames <- seq_len(ncol(data))
EM <- data[, centered_colnames, drop = FALSE]
ind <- apply(EM, 1, function(xx) any(is.na(xx)))
nr_missing <- sum(ind)
EM <- as.matrix(EM[!ind, , drop = FALSE])

# objective and gradient functions
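# (bodies elided in this diff view) In the standard MAIC formulation
# (Signorovitch et al. 2010), the objective is Q(alpha) = sum(exp(X %*% alpha))
# with gradient t(X) %*% exp(X %*% alpha); minimizing Q yields weights
# w = exp(X %*% alpha_hat) that balance the centered covariates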
objfn <- function(alpha, X) {
@@ -64,7 +69,7 @@ estimate_weights <- function(data, centered_colnames = NULL, startVal = 0, metho

# estimate weights
opt1 <- optim(
par = rep(start_val, ncol(EM)),
fn = objfn, gr = gradfn,
X = EM,
method = method,
@@ -81,7 +86,7 @@ estimate_weights <- function(data, centered_colnames = NULL, startVal = 0, metho
data$scaled_weights <- NA
data$scaled_weights[!ind] <- wt_rs

if (is.numeric(centered_colnames)) centered_colnames <- names(data)[centered_colnames]

# Output
list(
@@ -96,19 +101,23 @@ estimate_weights <- function(data, centered_colnames = NULL, startVal = 0, metho
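A hedged end-to-end sketch of the matching step (not part of the PR). The centered column names are hypothetical, and the AgD mean of 0.6 is hard-coded in place of a real aggregate summary (in the package workflow this centering would come from process_agd()/center_ipd()):

# simulate IPD and center two covariates against an assumed AgD mean of 0.6
sim_ipd <- generate_survival_data(N = 200, ncov = 2, seed = 7)
sim_ipd$x1_centered <- sim_ipd$x1 - 0.6
sim_ipd$x2_centered <- sim_ipd$x2 - 0.6

res <- estimate_weights(sim_ipd, centered_colnames = c("x1_centered", "x2_centered"))
res$ess             # effective sample size, (sum w)^2 / sum(w^2)
res$opt$convergence # 0 indicates optim() converged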

#' Plot MAIC weights in a histogram with key statistics in legend
#'
#' Generates a plot given the individuals weights with key summary in top right legend that includes
#' median weight, effective sample size (ESS), and reduction percentage (what percent ESS is reduced from the
#' original sample size).
#' Two types of weights are provided by \code{\link{estimate_weights}}: unscaled and scaled.
#' Scaled weights are relative to the original unit weights of each individual.
#' In other words, a scaled weight greater than 1 means that an individual carries more weight in the
#' re-weighted population than the original data and a scaled weight less than 1 means that an individual carries
#' less weight in the re-weighted population than the original data.
#'
#' @param wt a numeric vector of individual MAIC weights (derived using \code{\link{estimate_weights}})
#' @param main_title a character string, main title of the plot
#'
#' @return a plot of unscaled or scaled weights
#' @importFrom graphics hist
#' @export
plot_weights <- function(wt, main_title = "Unscaled Individual Weights") {
# calculate sample size and exclude NA from wt
nr_na <- sum(is.na(wt))
n <- length(wt) - nr_na
@@ -126,7 +135,7 @@ plot_weights <- function(wt, main_title = "Unscaled Individual Weigths") {
)
plot_lty <- c(2, NA, NA)

if (nr_na > 0) {
plot_legend <- c(plot_legend, paste0("#Missing Weights = ", nr_na))
plot_lty <- c(plot_lty, NA)
}
@@ -139,3 +148,41 @@ plot_weights <- function(wt, main_title = "Unscaled Individual Weigths") {
legend("topright", bty = "n", lty = plot_lty, cex = 0.8, legend = plot_legend)
}
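Continuing the sketch above, both weight columns appended by estimate_weights() can be passed to the plotting helper:

plot_weights(res$data$weights, main_title = "Unscaled Individual Weights")
plot_weights(res$data$scaled_weights, main_title = "Scaled Individual Weights")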

#' Check to see if weights are optimized correctly
#'
#' This function checks whether the optimization was carried out properly, by comparing the covariate averages
#' of the IPD before and after weighting.
#'
#' @param optimized object returned after calculating weights using \code{\link{estimate_weights}}
#' @param match_cov covariates that should be checked to see whether the weighted IPD average matches the
#' aggregate data average. This could be the same set of variables that were used for matching, or it can
#' include variables that were not used for matching (e.g. stratification variables)
#' @param digits number of digits for rounding summary table
#'
#' @return a matrix of weighted and unweighted covariate averages of the IPD, with the sample size (unweighted
#' row) and effective sample size (weighted row) in the first column
#' @export
check_weights <- function(optimized, match_cov, digits = 2) {
if (missing(match_cov)) stop("match_cov is missing. Covariates to check must be defined.")
ipd_with_weights <- optimized$data

arm_names <- c("Unweighted IPD", "Weighted IPD")
ess <- c(nrow(ipd_with_weights), optimized$ess)

unweighted_cov <- sapply(ipd_with_weights[, match_cov, drop = FALSE], mean)

weighted_cov <- sapply(
ipd_with_weights[, match_cov, drop = FALSE],
weighted.mean,
w = ipd_with_weights$weights
)

cov_combined <- rbind(unweighted_cov, weighted_cov)

baseline_summary <- cbind(ess, cov_combined)
rownames(baseline_summary) <- arm_names

round(baseline_summary, digits = digits)
}
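Finally, a sketch of the new check itself, reusing the hypothetical simulation above; the weighted IPD averages should land near the value used for centering (0.6), while the first column reports N for the unweighted row and the ESS for the weighted row:

check_weights(res, match_cov = c("x1", "x2"))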