From c3382d43824aa747be1848104ad3285e490c58f5 Mon Sep 17 00:00:00 2001
From: Gregory Chen <stat1013@gmail.com>
Date: Fri, 26 Jul 2024 06:49:09 +0000
Subject: [PATCH 1/5] modify estimate_weight function

---
 R/matching.R            | 31 ++++++++++++++++++++++++-------
 man/estimate_weights.Rd |  7 +++++++
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/R/matching.R b/R/matching.R
index 18520fda..39a075da 100644
--- a/R/matching.R
+++ b/R/matching.R
@@ -17,6 +17,9 @@
 #'   procedure will not be triggered, and hence the element `"boot"` of output list object will be NULL.
 #' @param set_seed_boot a scalar, the random seed for conducting the bootstrapping, only relevant if
 #'   \code{n_boot_iteration} is not NULL. By default, use seed 1234
+#' @param boot_strata a character vector of column names in \code{data} that defines the strata for bootstrapping.
+#'   This ensures that samples are drawn proportionally from each defined stratum. If \code{NULL},
+#'   no stratification during bootstrapping process. By default, it is "ARM"
 #' @param ... Additional `control` parameters passed to [stats::optim].
 #'
 #' @return a list with the following 4 elements,
@@ -29,13 +32,15 @@
 #'   modifiers}
 #'   \item{ess}{effective sample size, square of sum divided by sum of squares}
 #'   \item{opt}{R object returned by \code{base::optim()}, for assess convergence and other details}
+#'   \item{boot_strata}{'strata' from a boot::boot object}
+#'   \item{boot_seed}{column names in \code{data} of the stratification factors}
 #'   \item{boot}{a n by 2 by k array or NA, where n equals to number of rows in \code{data}, and k equals
 #'      \code{n_boot_iteration}. The 2 columns in the second dimension include a column of numeric indexes of the rows
 #'      in \code{data} that are selected at a bootstrapping iteration and a column of weights. \code{boot} is NA when
 #'      argument \code{n_boot_iteration} is set as NULL
 #'   }
 #' }
-#'
+#' @importFrom boot boot
 #' @examples
 #' data(agd)
 #' data(adsl_sat)
@@ -58,6 +63,7 @@ estimate_weights <- function(data,
                              method = "BFGS",
                              n_boot_iteration = NULL,
                              set_seed_boot = 1234,
+                             boot_strata = "ARM",
                              ...) {
   # pre check
   ch1 <- is.data.frame(data)
@@ -90,6 +96,13 @@ estimate_weights <- function(data,
     ))
   }
 
+  if(!is.null(boot_strata)) {
+    ch4 <- boot_strata %in% names(data)
+    if(!all(ch4)) {
+       stop(paste("follow items in boot_strata does not exist in names(data):", paste(boot_strata[!ch4], collapse = ",")))
+    }
+  }
+
   # prepare data for optimization
   if (is.null(centered_colnames)) centered_colnames <- seq_len(ncol(data))
   EM <- data[, centered_colnames, drop = FALSE]
@@ -107,7 +120,7 @@ estimate_weights <- function(data,
   # bootstrapping
   outboot <- if (is.null(n_boot_iteration)) {
     boot_seed <- NULL
-    boot_strata <- NULL
+    boot_strata_out <- NULL
     NULL
   } else {
     # Make sure to leave '.Random.seed' as-is on exit
@@ -115,17 +128,21 @@ estimate_weights <- function(data,
     on.exit(suspendInterrupts(set_random_seed(old_seed)))
     set.seed(set_seed_boot)
 
-    rowid_in_data <- which(!ind)
-    arms <- factor(data$ARM[rowid_in_data])
+    if(!is.null(boot_strata)) {
+      use_strata <- subset(data, subset = (!ind), select = boot_strata)
+      use_strata <- apply(use_strata, 1, paste, collapse = "--") |> factor()
+    } else {
+      use_strata <- rep(1, nrow(EM))
+    }
     boot_statistic <- function(d, w) optimise_weights(d[w, ], par = alpha, method = method, ...)$wt[, 1]
-    boot_out <- boot(EM, statistic = boot_statistic, R = n_boot_iteration, strata = arms)
+    boot_out <- boot::boot(EM, statistic = boot_statistic, R = n_boot_iteration, strata = use_strata)
 
     boot_array <- array(dim = list(nrow(EM), 2, n_boot_iteration))
     dimnames(boot_array) <- list(sampled_patient = NULL, c("rowid", "weight"), bootstrap_iteration = NULL)
     boot_array[, 1, ] <- t(boot.array(boot_out, TRUE))
     boot_array[, 2, ] <- t(boot_out$t)
     boot_seed <- boot_out$seed
-    boot_strata <- boot_out$strata
+    boot_strata_out <- boot_out$strata
     boot_array
   }
 
@@ -147,7 +164,7 @@ estimate_weights <- function(data,
     opt = opt1$opt,
     boot = outboot,
     boot_seed = boot_seed,
-    boot_strata = boot_strata,
+    boot_strata = boot_strata_out,
     rows_with_missing = rows_with_missing
   )
 
diff --git a/man/estimate_weights.Rd b/man/estimate_weights.Rd
index 73535f68..68da056d 100644
--- a/man/estimate_weights.Rd
+++ b/man/estimate_weights.Rd
@@ -12,6 +12,7 @@ estimate_weights(
   method = "BFGS",
   n_boot_iteration = NULL,
   set_seed_boot = 1234,
+  boot_strata = "ARM",
   ...
 )
 
@@ -42,6 +43,10 @@ procedure will not be triggered, and hence the element \code{"boot"} of output l
 \item{set_seed_boot}{a scalar, the random seed for conducting the bootstrapping, only relevant if
 \code{n_boot_iteration} is not NULL. By default, use seed 1234}
 
+\item{boot_strata}{a character vector of column names in \code{data} that defines the strata for bootstrapping.
+This ensures that samples are drawn proportionally from each defined stratum. If \code{NULL},
+no stratification during bootstrapping process. By default, it is "ARM"}
+
 \item{...}{Additional \code{control} parameters passed to \link[stats:optim]{stats::optim}.}
 
 \item{x}{object from \link{estimate_weights}}
@@ -69,6 +74,8 @@ effect modifiers}
 modifiers}
 \item{ess}{effective sample size, square of sum divided by sum of squares}
 \item{opt}{R object returned by \code{base::optim()}, for assess convergence and other details}
+\item{boot_strata}{'strata' from a boot::boot object}
+\item{boot_seed}{column names in \code{data} of the stratification factors}
 \item{boot}{a n by 2 by k array or NA, where n equals to number of rows in \code{data}, and k equals
 \code{n_boot_iteration}. The 2 columns in the second dimension include a column of numeric indexes of the rows
 in \code{data} that are selected at a bootstrapping iteration and a column of weights. \code{boot} is NA when

From 5117a52e72706586d5a1c7aebef8af5619cb977e Mon Sep 17 00:00:00 2001
From: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 26 Jul 2024 06:51:17 +0000
Subject: [PATCH 2/5] [skip style] [skip vbump] Restyle files

---
 R/matching.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/matching.R b/R/matching.R
index 39a075da..037afec0 100644
--- a/R/matching.R
+++ b/R/matching.R
@@ -96,10 +96,10 @@ estimate_weights <- function(data,
     ))
   }
 
-  if(!is.null(boot_strata)) {
+  if (!is.null(boot_strata)) {
     ch4 <- boot_strata %in% names(data)
-    if(!all(ch4)) {
-       stop(paste("follow items in boot_strata does not exist in names(data):", paste(boot_strata[!ch4], collapse = ",")))
+    if (!all(ch4)) {
+      stop(paste("follow items in boot_strata does not exist in names(data):", paste(boot_strata[!ch4], collapse = ",")))
     }
   }
 
@@ -128,7 +128,7 @@ estimate_weights <- function(data,
     on.exit(suspendInterrupts(set_random_seed(old_seed)))
     set.seed(set_seed_boot)
 
-    if(!is.null(boot_strata)) {
+    if (!is.null(boot_strata)) {
       use_strata <- subset(data, subset = (!ind), select = boot_strata)
       use_strata <- apply(use_strata, 1, paste, collapse = "--") |> factor()
     } else {

From 5cf421bd53692229a2c58f981bc6e9a548844301 Mon Sep 17 00:00:00 2001
From: Gregory Chen <65814078+hoppanda@users.noreply.github.com>
Date: Sun, 11 Aug 2024 21:48:24 +0200
Subject: [PATCH 3/5] Update R/matching.R

Co-authored-by: Isaac Gravestock <83659704+gravesti@users.noreply.github.com>
---
 R/matching.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/matching.R b/R/matching.R
index 037afec0..c37c5c62 100644
--- a/R/matching.R
+++ b/R/matching.R
@@ -129,8 +129,7 @@ estimate_weights <- function(data,
     set.seed(set_seed_boot)
 
     if (!is.null(boot_strata)) {
-      use_strata <- subset(data, subset = (!ind), select = boot_strata)
-      use_strata <- apply(use_strata, 1, paste, collapse = "--") |> factor()
+      use_strata <- interaction(data[!ind, boot_strata])
     } else {
       use_strata <- rep(1, nrow(EM))
     }

From f1ab6996b386c2c12fe95c0eabfcc9a583336834 Mon Sep 17 00:00:00 2001
From: Gregory Chen <65814078+hoppanda@users.noreply.github.com>
Date: Sun, 11 Aug 2024 21:48:39 +0200
Subject: [PATCH 4/5] Update R/matching.R

Co-authored-by: Isaac Gravestock <83659704+gravesti@users.noreply.github.com>
---
 R/matching.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/matching.R b/R/matching.R
index c37c5c62..a3e7a8c5 100644
--- a/R/matching.R
+++ b/R/matching.R
@@ -99,7 +99,7 @@ estimate_weights <- function(data,
   if (!is.null(boot_strata)) {
     ch4 <- boot_strata %in% names(data)
     if (!all(ch4)) {
-      stop(paste("follow items in boot_strata does not exist in names(data):", paste(boot_strata[!ch4], collapse = ",")))
+      stop("Some variables in boot_strata are not in data:", toString(boot_strata[!ch4]))
     }
   }
 

From 379d3dc64b04c87419c1e3e4ebdf6e8ab672f49e Mon Sep 17 00:00:00 2001
From: Isaac Gravestock <isaac.gravestock@roche.com>
Date: Thu, 15 Aug 2024 13:36:08 +0200
Subject: [PATCH 5/5] add test

---
 R/matching.R                   |  2 +-
 tests/testthat/test-matching.R | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/R/matching.R b/R/matching.R
index a3e7a8c5..747d9b91 100644
--- a/R/matching.R
+++ b/R/matching.R
@@ -99,7 +99,7 @@ estimate_weights <- function(data,
   if (!is.null(boot_strata)) {
     ch4 <- boot_strata %in% names(data)
     if (!all(ch4)) {
-      stop("Some variables in boot_strata are not in data:", toString(boot_strata[!ch4]))
+      stop("Some variables in boot_strata are not in data: ", toString(boot_strata[!ch4]))
     }
   }
 
diff --git a/tests/testthat/test-matching.R b/tests/testthat/test-matching.R
index b87c387d..271eb174 100644
--- a/tests/testthat/test-matching.R
+++ b/tests/testthat/test-matching.R
@@ -96,6 +96,40 @@ test_that("estimate_weights works as expected with bootstrapping", {
 })
 
 
+test_that("estimate_weights works as expected with alternative bootstrap strata", {
+  load(system.file("extdata", "ipd.rda", package = "maicplus", mustWork = TRUE))
+  load(system.file("extdata", "agd.rda", package = "maicplus", mustWork = TRUE))
+  ipd_centered <- center_ipd(ipd = ipd, agd = agd)
+  centered_colnames <- paste0(c("AGE", "AGE_SQUARED", "ECOG0", "SMOKE", "N_PR_THER_MEDIAN"), "_CENTERED")
+  expect_output(
+    result <- estimate_weights(
+      data = ipd_centered,
+      centered_colnames = centered_colnames,
+      n_boot_iteration = 3,
+      set_seed = 999,
+      trace = 2,
+      boot_strata = c("ARM", "SEX")
+    ),
+    "converged"
+  )
+
+  expect_s3_class(result, "maicplus_estimate_weights")
+  expect_equal(sum(result$data$weights), 206.83843133)
+
+  expect_error(
+    result <- estimate_weights(
+      data = ipd_centered,
+      centered_colnames = centered_colnames,
+      n_boot_iteration = 3,
+      set_seed = 999,
+      trace = 2,
+      boot_strata = "FISH"
+    ),
+    "boot_strata are not in data"
+  )
+})
+
+
 test_that("estimate_weights works when the input is a tibble", {
   skip_if_not_installed("tibble")
   load(system.file("extdata", "ipd.rda", package = "maicplus", mustWork = TRUE))