hta-pharma · gravesti · Jun 21, 2024 · May 24, 2024 · May 24, 2024 · May 24, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -13,6 +13,7 @@ repos:
           - shiny
           - lubridate
           - DescTools
+          - lmtest
       # codemeta must be above use-tidy-description when both are used
       # -   id: codemeta-description-updated
       - id: use-tidy-description

diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ export(basic_kmplot)
 export(basic_kmplot2)
 export(bootstrap_HR)
 export(bucher)
+export(calculate_weights_legend)
 export(center_ipd)
 export(check_weights)
 export(dummize_ipd)

diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,220 @@
+# unanchored datasets ------
+
+#' Patient data from single arm study
+#' @format A data frame with 500 rows and 8 columns:
+#'   \describe{
+#'     \item{USUBJID}{Unique subject identifiers for patients.}
+#'     \item{ARM}{Assigned treatment arm.}
+#'     \item{AGE}{Age in years at baseline.}
+#'     \item{SEX}{Sex of patient recorded as character `"Male"`/`"Female"`.}
+#'     \item{SMOKE}{Smoking status at baseline as integer `1`/`0`.}
+#'     \item{ECOG0}{Indicator of ECOG score = 0 at baseline as integer `1`/`0`.}
+#'     \item{N_PR_THER}{Number of prior therapies received as integer `1, 2, 3, 4`.}
+#'     \item{SEX_MALE}{Indicator of `SEX == "Male"` as numeric `1`/`0`.}
+#'
+#'   }
+#' @keywords dataset
+#' @family unanchored datasets
+"adsl_sat"
+
+#' Survival data from single arm trial
+#' @format A data frame with 500 rows and 10 columns:
+#'   \describe{
+#'     \item{USUBJID}{Unique subject identifiers for patients.}
+#'     \item{ARM}{Assigned treatment arm, `"A"`.}
+#'     \item{AVAL}{Analysis value, which in this dataset is overall survival time in days.}
+#'     \item{AVALU}{Unit of `AVAL`.}
+#'     \item{PARAMCD}{Parameter code of `AVAL`, `"OS"`.}
+#'     \item{PARAM}{Parameter name of `AVAL`, `"Overall Survival"`.}
+#'     \item{CNSR}{Censoring indicator `0`/`1`.}
+#'     \item{TIME}{Survival time in days.}
+#'     \item{EVENT}{Event indicator `0`/`1`.}
+#'   }
+#' @family unanchored datasets
+#' @keywords dataset
+"adtte_sat"
+
+
+#' Pseudo individual patient survival data from published study
+#' @format A data frame with 300 rows and 3 columns:
+#'   \describe{
+#'     \item{Time}{Survival time in days.}
+#'     \item{Event}{Event indicator `0`/`1`.}
+#'     \item{ARM}{Assigned treatment arm, `"B"`.}
+#'   }
+#' @family unanchored datasets
+#' @keywords dataset
+"pseudo_ipd_sat"
+
+
+#' Centered patient data from single arm trial
+#' @format A data frame with 500 rows and 14 columns:
+#'   \describe{
+#'     \item{USUBJID}{Unique subject identifiers for patients.}
+#'     \item{ARM}{Assigned treatment arm.}
+#'     \item{AGE}{Age in years at baseline.}
+#'     \item{SEX}{Sex of patient recorded as character `"Male"`/`"Female"`.}
+#'     \item{SMOKE}{Smoking status at baseline as integer `1`/`0`.}
+#'     \item{ECOG0}{Indicator of ECOG score = 0 at baseline as integer `1`/`0`.}
+#'     \item{N_PR_THER}{Number of prior therapies received as integer `1, 2, 3, 4`.}
+#'     \item{SEX_MALE}{Indicator of `SEX == "Male"` as numeric `1`/`0`.}
+#'     \item{AGE_CENTERED}{Age in years at baseline relative to average in aggregate data [agd].}
+#'     \item{AGE_MEDIAN_CENTERED}{`AGE` greater/less than `AGE_MEDIAN` in [agd] coded as `1`/`0` and then centered at
+#'      0.5.}
+#'     \item{AGE_SQUARED_CENTERED}{`AGE` squared and centered with respect to the `AGE` in [agd]. The squared age in the
+#'       aggregate data is derived from the \eqn{E(X^2)} term in the variance formula.}
+#'     \item{SEX_MALE_CENTERED}{`SEX_MALE` centered by the proportion of male patients in [agd]}
+#'     \item{ECOG0_CENTERED}{`ECOG0` centered by the proportion of `ECOG0` in [agd]}
+#'     \item{SMOKE_CENTERED}{`SMOKE` centered by the proportion of `SMOKE` in [agd]}
+#'     \item{N_PR_THER_MEDIAN_CENTERED}{`N_PR_THER` centered by the median in [agd].}
+#'   }
+#' @family unanchored datasets
+#' @keywords dataset
+"centered_ipd_sat"
+
+#' Binary outcome data from single arm trial
+#' @format A data frame with 500 rows and 5 columns:
+#'   \describe{
+#'     \item{USUBJID}{Unique subject identifiers for patients.}
+#'     \item{ARM}{Assigned treatment arm.}
+#'     \item{AVAL}{Analysis value, in this dataset an indicator of response.}
+#'     \item{PARAM}{Parameter type of `AVAL`.}
+#'     \item{RESPONSE}{Indicator of response.}
+#'   }
+#' @family unanchored datasets
+#' @keywords dataset
+"adrs_sat"
+
+#' Weighted object for single arm trial data
+#' @format A `maicplus_estimate_weights` object created by [estimate_weights()] containing
+#'   \describe{
+#'     \item{data}{patient level data with weights}
+#'     \item{centered_colnames}{Columns used in MAIC}
+#'     \item{nr_missing}{Number of observations with missing data}
+#'     \item{ess}{Effective sample size}
+#'     \item{opt}{Information from `optim` from weight calculation}
+#'     \item{boot}{Parameters and bootstrap sample weights, `NULL` in this object}
+#'   }
+#' @family unanchored datasets
+#' @keywords dataset
+"weighted_sat"
+
+# aggregate data ------
+
+#' Aggregate effect modifier data from published study
+#'
+#' This data is formatted to be used in [center_ipd()].
+#'
+#' @format A data frame with 3 rows and 9 columns:
+#'   \describe{
+#'     \item{STUDY}{The study name, Study_XXXX}
+#'     \item{ARM}{Study arm name or total}
+#'     \item{N}{Number of observations in study arm}
+#'     \item{AGE_MEAN}{Mean age in study arm}
+#'     \item{AGE_MEDIAN}{Median age in study arm}
+#'     \item{AGE_SD}{Standard deviation of age in study arm}
+#'     \item{SEX_MALE_COUNT}{Number of male patients}
+#'     \item{ECOG0_COUNT}{Number of patients with ECOG score = 0}
+#'     \item{SMOKE_COUNT}{Number of smokers}
+#'     \item{N_PR_THER_MEDIAN}{Median number of prior therapies}
+#'   }
+#' @family unanchored datasets
+#' @family anchored datasets
+#' @keywords dataset
+"agd"
+
+
+# anchored datasets -------
+
+#' Patient data from two arm trial
+#' @format A data frame with 1000 rows and 8 columns:
+#'   \describe{
+#'     \item{USUBJID}{Unique subject identifiers for patients.}
+#'     \item{ARM}{Assigned treatment arm.}
+#'     \item{AGE}{Age in years at baseline.}
+#'     \item{SEX}{Sex of patient recorded as character `"Male"`/`"Female"`.}
+#'     \item{SMOKE}{Smoking status at baseline as integer `1`/`0`.}
+#'     \item{ECOG0}{Indicator of ECOG score = 0 at baseline as integer `1`/`0`.}
+#'     \item{N_PR_THER}{Number of prior therapies received as integer `1, 2, 3, 4`.}
+#'     \item{SEX_MALE}{Indicator of `SEX == "Male"` as numeric `1`/`0`.}
+#'   }
+#' @family anchored datasets
+#' @keywords dataset
+"adsl_twt"
+
+
+#' Survival data from two arm trial
+#' @format A data frame with 1000 rows and 10 columns:
+#'   \describe{
+#'     \item{USUBJID}{Unique subject identifiers for patients.}
+#'     \item{ARM}{Assigned treatment arm, `"A"`, `"C"`.}
+#'     \item{AVAL}{Analysis value, which in this dataset is overall survival time in days.}
+#'     \item{AVALU}{Unit of `AVAL`.}
+#'     \item{PARAMCD}{Parameter code of `AVAL`, `"OS"`.}
+#'     \item{PARAM}{Parameter name of `AVAL`, `"Overall Survival"`.}
+#'     \item{CNSR}{Censoring indicator `0`/`1`.}
+#'     \item{TIME}{Survival time in days.}
+#'     \item{EVENT}{Event indicator `0`/`1`.}
+#'   }
+#' @family anchored datasets
+#' @keywords dataset
+"adtte_twt"
+
+#' Binary outcome data from two arm trial
+#' @format A data frame with 1000 rows and 5 columns:
+#'   \describe{
+#'     \item{USUBJID}{Unique subject identifiers for patients.}
+#'     \item{ARM}{Assigned treatment arm, `"A"`, `"C"`.}
+#'     \item{AVAL}{Analysis value, in this dataset an indicator of response.}
+#'     \item{PARAM}{Parameter type of `AVAL`.}
+#'     \item{RESPONSE}{Indicator of response.}
+#'   }
+"adrs_twt"
+
+#' Pseudo individual patient survival data from published two arm study
+#' @format A data frame with 800 rows and 3 columns:
+#'   \describe{
+#'     \item{Time}{Survival time in days.}
+#'     \item{Event}{Event indicator `0`/`1`.}
+#'     \item{ARM}{Assigned treatment arm, `"B"`, `"C"`.}
+#'   }
+#' @family anchored datasets
+#' @keywords dataset
+"pseudo_ipd_twt"
+
+
+#' Centered patient data from two arm trial
+#' @format A data frame with 1000 rows and 14 columns:
+#'   \describe{
+#'     \item{USUBJID}{Unique subject identifiers for patients.}
+#'     \item{ARM}{Assigned treatment arm.}
+#'     \item{AGE}{Age in years at baseline.}
+#'     \item{SEX}{Sex of patient recorded as character `"Male"`/`"Female"`.}
+#'     \item{SMOKE}{Smoking status at baseline as integer `1`/`0`.}
+#'     \item{ECOG0}{Indicator of ECOG score = 0 at baseline as integer `1`/`0`.}
+#'     \item{N_PR_THER}{Number of prior therapies received as integer `1, 2, 3, 4`.}
+#'     \item{SEX_MALE}{Indicator of `SEX == "Male"` as numeric `1`/`0`.}
+#'     \item{AGE_CENTERED}{Age in years at baseline relative to average in aggregate data [agd].}
+#'     \item{AGE_MEDIAN_CENTERED}{`AGE` greater/less than `AGE_MEDIAN` in [agd] coded as `1`/`0` and then centered at
+#'      0.5.}
+#'     \item{AGE_SQUARED_CENTERED}{`AGE` squared and centered with respect to the `AGE` in [agd]. The squared age in the
+#'       aggregate data is derived from the \eqn{E(X^2)} term in the variance formula.}
+#'     \item{SEX_MALE_CENTERED}{`SEX_MALE` centered by the proportion of male patients in [agd]}
+#'     \item{ECOG0_CENTERED}{`ECOG0` centered by the proportion of `ECOG0` in [agd]}
+#'     \item{SMOKE_CENTERED}{`SMOKE` centered by the proportion of `SMOKE` in [agd]}
+#'     \item{N_PR_THER_MEDIAN_CENTERED}{`N_PR_THER` centered by the median in [agd].}
+#'   }
+#' @keywords dataset
+#' @family anchored datasets
+"centered_ipd_twt"
+
+
+if (FALSE) { # guard is always FALSE: the helper below is never defined when this file is sourced
+  make_roxygen_data <- function(df) { # writes a roxygen `@format` skeleton for `df` via cat()
+    cn <- colnames(df)
+    cat("#' @format A data frame with", nrow(df), "rows and", ncol(df), "columns:\n")
+    cat("#'   \\describe{\n")
+    for (i in cn) cat("#'     \\item{", i, "}{}\n", sep = "") # one empty item stub per column name
+    cat("#'   }") # closes the describe block; no trailing newline is written
+  }
+}
diff --git a/R/matching.R b/R/matching.R
@@ -37,14 +37,19 @@
 #' }
 #'
 #' @examples
-#' load(system.file("extdata", "ipd.rda", package = "maicplus", mustWork = TRUE))
-#' load(system.file("extdata", "agd.rda", package = "maicplus", mustWork = TRUE))
-#' ipd_centered <- center_ipd(ipd = ipd, agd = agd)
-#'
-#' centered_colnames <- c("AGE", "AGE_SQUARED", "SEX_MALE", "ECOG0", "SMOKE", "N_PR_THER_MEDIAN")
-#' centered_colnames <- paste0(centered_colnames, "_CENTERED")
+#' data(agd)
+#' data(adsl_sat)
+#' ipd_centered <- center_ipd(ipd = adsl_sat, agd = process_agd(agd))
+#' centered_colnames <- grep("_CENTERED", colnames(ipd_centered), value = TRUE)
+#' centered_colnames
 #' weighted_data <- estimate_weights(data = ipd_centered, centered_colnames = centered_colnames)
-#'
+#' \donttest{
+#' # To later estimate bootstrap confidence intervals, we calculate the weights
+#' # for the bootstrap samples:
+#' weighted_data_boot <- estimate_weights(
+#'   data = ipd_centered, centered_colnames = centered_colnames, n_boot_iteration = 500
+#' )
+#' }
 #' @export
 
 estimate_weights <- function(data,
@@ -199,10 +204,9 @@ optimise_weights <- function(matrix,
 #'
 #' @return list of ESS, ESS reduction, median value of scaled and unscaled weights, and missing count
 #' @examples
-#' \dontrun{
-#' load(system.file("extdata", "weighted_data.rda", package = "maicplus", mustWork = TRUE))
-#' calculate_weights_legend(weighted_data)
-#' }
+#' data("weighted_sat")
+#' calculate_weights_legend(weighted_sat)
+#' @export
 #' @keywords internal
 
 calculate_weights_legend <- function(weighted_data) {
@@ -362,11 +366,11 @@ plot_weights_ggplot <- function(weighted_data, bin_col, vline_col,
 #' @param bins (`ggplot` only) number of bin parameter to use
 #'
 #' @examples
-#' load(system.file("extdata", "weighted_data.rda", package = "maicplus", mustWork = TRUE))
-#' plot(weighted_data)
+#' plot(weighted_sat)
 #'
-#' library(ggplot2)
-#' plot(weighted_data, ggplot = TRUE)
+#' if (requireNamespace("ggplot2")) {
+#'   plot(weighted_sat, ggplot = TRUE)
+#' }
 #' @describeIn estimate_weights Plot method for estimate_weights objects
 #' @export
 
@@ -397,9 +401,7 @@ plot.maicplus_estimate_weights <- function(x, ggplot = FALSE,
 #' aggregated data following the same naming convention
 #'
 #' @examples
-#' load(system.file("extdata", "weighted_data.rda", package = "maicplus", mustWork = TRUE))
-#' load(system.file("extdata", "agd.rda", package = "maicplus", mustWork = TRUE))
-#' check_weights(weighted_data, agd)
+#' check_weights(weighted_sat, process_agd(agd))
 #'
 #' @import DescTools
 #'

diff --git a/R/process_data.R b/R/process_data.R
@@ -114,8 +114,8 @@ process_agd <- function(raw_agd) {
 #' @param dummize_ref_level vector of reference level of the variables to binarize
 #'
 #' @examples
-#' adsl <- read.csv(system.file("extdata", "adsl.csv", package = "maicplus", mustWork = TRUE))
-#' adsl <- dummize_ipd(adsl, dummize_cols = c("SEX"), dummize_ref_level = c("Female"))
+#' data(adsl_twt)
+#' dummize_ipd(adsl_twt, dummize_cols = c("SEX"), dummize_ref_level = c("Male"))
 #'
 #' @return ipd with dummized columns
 #' @export
@@ -152,8 +152,8 @@ dummize_ipd <- function(raw_ipd, dummize_cols, dummize_ref_level) {
 #' suffix is no longer accepted.
 #' @examples
 #' # load in IPD
-#' adsl <- read.csv(system.file("extdata", "adsl.csv", package = "maicplus", mustWork = TRUE))
-#' adsl <- dummize_ipd(adsl, dummize_cols = c("SEX"), dummize_ref_level = c("Female"))
+#' data(adsl_sat)
+#' adsl <- dummize_ipd(adsl_sat, dummize_cols = c("SEX"), dummize_ref_level = c("Female"))
 #'
 #' # Reading aggregate data by Excel
 #' target_pop <- read.csv(
@@ -162,7 +162,7 @@ dummize_ipd <- function(raw_ipd, dummize_cols, dummize_ref_level) {
 #' agd <- process_agd(target_pop)
 #'
 #' # Alternatively, you can specify aggregate data manually in data frame
-#' load(system.file("extdata", "agd.rda", package = "maicplus", mustWork = TRUE))
+#' data(agd)
 #' ipd_centered <- center_ipd(ipd = adsl, agd = agd)
 #'
 #' @return centered ipd using aggregate level data averages