From c3382d43824aa747be1848104ad3285e490c58f5 Mon Sep 17 00:00:00 2001 From: Gregory Chen Date: Fri, 26 Jul 2024 06:49:09 +0000 Subject: [PATCH 1/5] modify estimate_weight function --- R/matching.R | 31 ++++++++++++++++++++++++------- man/estimate_weights.Rd | 7 +++++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/R/matching.R b/R/matching.R index 18520fda..39a075da 100644 --- a/R/matching.R +++ b/R/matching.R @@ -17,6 +17,9 @@ #' procedure will not be triggered, and hence the element `"boot"` of output list object will be NULL. #' @param set_seed_boot a scalar, the random seed for conducting the bootstrapping, only relevant if #' \code{n_boot_iteration} is not NULL. By default, use seed 1234 +#' @param boot_strata a character vector of column names in \code{data} that defines the strata for bootstrapping. +#' This ensures that samples are drawn proportionally from each defined stratum. If \code{NULL}, +#' no stratification during bootstrapping process. By default, it is "ARM" #' @param ... Additional `control` parameters passed to [stats::optim]. #' #' @return a list with the following 4 elements, @@ -29,13 +32,15 @@ #' modifiers} #' \item{ess}{effective sample size, square of sum divided by sum of squares} #' \item{opt}{R object returned by \code{base::optim()}, for assess convergence and other details} +#' \item{boot_strata}{'strata' from a boot::boot object} +#' \item{boot_seed}{column names in \code{data} of the stratification factors} #' \item{boot}{a n by 2 by k array or NA, where n equals to number of rows in \code{data}, and k equals #' \code{n_boot_iteration}. The 2 columns in the second dimension include a column of numeric indexes of the rows #' in \code{data} that are selected at a bootstrapping iteration and a column of weights. \code{boot} is NA when #' argument \code{n_boot_iteration} is set as NULL #' } #' } -#' +#' @importFrom boot boot #' @examples #' data(agd) #' data(adsl_sat) @@ -58,6 +63,7 @@ estimate_weights <- function(data, method = "BFGS", n_boot_iteration = NULL, set_seed_boot = 1234, + boot_strata = "ARM", ...) { # pre check ch1 <- is.data.frame(data) @@ -90,6 +96,13 @@ estimate_weights <- function(data, )) } + if(!is.null(boot_strata)) { + ch4 <- boot_strata %in% names(data) + if(!all(ch4)) { + stop(paste("follow items in boot_strata does not exist in names(data):", paste(boot_strata[!ch4], collapse = ","))) + } + } + # prepare data for optimization if (is.null(centered_colnames)) centered_colnames <- seq_len(ncol(data)) EM <- data[, centered_colnames, drop = FALSE] @@ -107,7 +120,7 @@ estimate_weights <- function(data, # bootstrapping outboot <- if (is.null(n_boot_iteration)) { boot_seed <- NULL - boot_strata <- NULL + boot_strata_out <- NULL NULL } else { # Make sure to leave '.Random.seed' as-is on exit @@ -115,17 +128,21 @@ estimate_weights <- function(data, on.exit(suspendInterrupts(set_random_seed(old_seed))) set.seed(set_seed_boot) - rowid_in_data <- which(!ind) - arms <- factor(data$ARM[rowid_in_data]) + if(!is.null(boot_strata)) { + use_strata <- subset(data, subset = (!ind), select = boot_strata) + use_strata <- apply(use_strata, 1, paste, collapse = "--") |> factor() + } else { + use_strata <- rep(1, nrow(EM)) + } boot_statistic <- function(d, w) optimise_weights(d[w, ], par = alpha, method = method, ...)$wt[, 1] - boot_out <- boot(EM, statistic = boot_statistic, R = n_boot_iteration, strata = arms) + boot_out <- boot::boot(EM, statistic = boot_statistic, R = n_boot_iteration, strata = use_strata) boot_array <- array(dim = list(nrow(EM), 2, n_boot_iteration)) dimnames(boot_array) <- list(sampled_patient = NULL, c("rowid", "weight"), bootstrap_iteration = NULL) boot_array[, 1, ] <- t(boot.array(boot_out, TRUE)) boot_array[, 2, ] <- t(boot_out$t) boot_seed <- boot_out$seed - boot_strata <- boot_out$strata + boot_strata_out <- boot_out$strata boot_array } @@ -147,7 +164,7 @@ estimate_weights <- function(data, opt = opt1$opt, boot = outboot, boot_seed = boot_seed, - boot_strata = boot_strata, + boot_strata = boot_strata_out, rows_with_missing = rows_with_missing ) diff --git a/man/estimate_weights.Rd b/man/estimate_weights.Rd index 73535f68..68da056d 100644 --- a/man/estimate_weights.Rd +++ b/man/estimate_weights.Rd @@ -12,6 +12,7 @@ estimate_weights( method = "BFGS", n_boot_iteration = NULL, set_seed_boot = 1234, + boot_strata = "ARM", ... ) @@ -42,6 +43,10 @@ procedure will not be triggered, and hence the element \code{"boot"} of output l \item{set_seed_boot}{a scalar, the random seed for conducting the bootstrapping, only relevant if \code{n_boot_iteration} is not NULL. By default, use seed 1234} +\item{boot_strata}{a character vector of column names in \code{data} that defines the strata for bootstrapping. +This ensures that samples are drawn proportionally from each defined stratum. If \code{NULL}, +no stratification during bootstrapping process. By default, it is "ARM"} + \item{...}{Additional \code{control} parameters passed to \link[stats:optim]{stats::optim}.} \item{x}{object from \link{estimate_weights}} @@ -69,6 +74,8 @@ effect modifiers} modifiers} \item{ess}{effective sample size, square of sum divided by sum of squares} \item{opt}{R object returned by \code{base::optim()}, for assess convergence and other details} +\item{boot_strata}{'strata' from a boot::boot object} +\item{boot_seed}{column names in \code{data} of the stratification factors} \item{boot}{a n by 2 by k array or NA, where n equals to number of rows in \code{data}, and k equals \code{n_boot_iteration}. The 2 columns in the second dimension include a column of numeric indexes of the rows in \code{data} that are selected at a bootstrapping iteration and a column of weights. \code{boot} is NA when From 5117a52e72706586d5a1c7aebef8af5619cb977e Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2024 06:51:17 +0000 Subject: [PATCH 2/5] [skip style] [skip vbump] Restyle files --- R/matching.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/matching.R b/R/matching.R index 39a075da..037afec0 100644 --- a/R/matching.R +++ b/R/matching.R @@ -96,10 +96,10 @@ estimate_weights <- function(data, )) } - if(!is.null(boot_strata)) { + if (!is.null(boot_strata)) { ch4 <- boot_strata %in% names(data) - if(!all(ch4)) { - stop(paste("follow items in boot_strata does not exist in names(data):", paste(boot_strata[!ch4], collapse = ","))) + if (!all(ch4)) { + stop(paste("follow items in boot_strata does not exist in names(data):", paste(boot_strata[!ch4], collapse = ","))) } } @@ -128,7 +128,7 @@ estimate_weights <- function(data, on.exit(suspendInterrupts(set_random_seed(old_seed))) set.seed(set_seed_boot) - if(!is.null(boot_strata)) { + if (!is.null(boot_strata)) { use_strata <- subset(data, subset = (!ind), select = boot_strata) use_strata <- apply(use_strata, 1, paste, collapse = "--") |> factor() } else { From 5cf421bd53692229a2c58f981bc6e9a548844301 Mon Sep 17 00:00:00 2001 From: Gregory Chen <65814078+hoppanda@users.noreply.github.com> Date: Sun, 11 Aug 2024 21:48:24 +0200 Subject: [PATCH 3/5] Update R/matching.R Co-authored-by: Isaac Gravestock <83659704+gravesti@users.noreply.github.com> --- R/matching.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/matching.R b/R/matching.R index 037afec0..c37c5c62 100644 --- a/R/matching.R +++ b/R/matching.R @@ -129,8 +129,7 @@ estimate_weights <- function(data, set.seed(set_seed_boot) if (!is.null(boot_strata)) { - use_strata <- subset(data, subset = (!ind), select = boot_strata) - use_strata <- apply(use_strata, 1, paste, collapse = "--") |> factor() + use_strata <- interaction(data[!ind, boot_strata]) } else { use_strata <- rep(1, nrow(EM)) } From f1ab6996b386c2c12fe95c0eabfcc9a583336834 Mon Sep 17 00:00:00 2001 From: Gregory Chen <65814078+hoppanda@users.noreply.github.com> Date: Sun, 11 Aug 2024 21:48:39 +0200 Subject: [PATCH 4/5] Update R/matching.R Co-authored-by: Isaac Gravestock <83659704+gravesti@users.noreply.github.com> --- R/matching.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/matching.R b/R/matching.R index c37c5c62..a3e7a8c5 100644 --- a/R/matching.R +++ b/R/matching.R @@ -99,7 +99,7 @@ estimate_weights <- function(data, if (!is.null(boot_strata)) { ch4 <- boot_strata %in% names(data) if (!all(ch4)) { - stop(paste("follow items in boot_strata does not exist in names(data):", paste(boot_strata[!ch4], collapse = ","))) + stop("Some variables in boot_strata are not in data:", toString(boot_strata[!ch4])) } } From 379d3dc64b04c87419c1e3e4ebdf6e8ab672f49e Mon Sep 17 00:00:00 2001 From: Isaac Gravestock Date: Thu, 15 Aug 2024 13:36:08 +0200 Subject: [PATCH 5/5] add test --- R/matching.R | 2 +- tests/testthat/test-matching.R | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/R/matching.R b/R/matching.R index a3e7a8c5..747d9b91 100644 --- a/R/matching.R +++ b/R/matching.R @@ -99,7 +99,7 @@ estimate_weights <- function(data, if (!is.null(boot_strata)) { ch4 <- boot_strata %in% names(data) if (!all(ch4)) { - stop("Some variables in boot_strata are not in data:", toString(boot_strata[!ch4])) + stop("Some variables in boot_strata are not in data: ", toString(boot_strata[!ch4])) } } diff --git a/tests/testthat/test-matching.R b/tests/testthat/test-matching.R index b87c387d..271eb174 100644 --- a/tests/testthat/test-matching.R +++ b/tests/testthat/test-matching.R @@ -96,6 +96,40 @@ test_that("estimate_weights works as expected with bootstrapping", { }) +test_that("estimate_weights works as expected with alternative bootstrap strata", { + load(system.file("extdata", "ipd.rda", package = "maicplus", mustWork = TRUE)) + load(system.file("extdata", "agd.rda", package = "maicplus", mustWork = TRUE)) + ipd_centered <- center_ipd(ipd = ipd, agd = agd) + centered_colnames <- paste0(c("AGE", "AGE_SQUARED", "ECOG0", "SMOKE", "N_PR_THER_MEDIAN"), "_CENTERED") + expect_output( + result <- estimate_weights( + data = ipd_centered, + centered_colnames = centered_colnames, + n_boot_iteration = 3, + set_seed = 999, + trace = 2, + boot_strata = c("ARM", "SEX") + ), + "converged" + ) + + expect_s3_class(result, "maicplus_estimate_weights") + expect_equal(sum(result$data$weights), 206.83843133) + + expect_error( + result <- estimate_weights( + data = ipd_centered, + centered_colnames = centered_colnames, + n_boot_iteration = 3, + set_seed = 999, + trace = 2, + boot_strata = "FISH" + ), + "boot_strata are not in data" + ) +}) + + test_that("estimate_weights works when the input is a tibble", { skip_if_not_installed("tibble") load(system.file("extdata", "ipd.rda", package = "maicplus", mustWork = TRUE))