hta-pharma · gravesti · Jul 28, 2023 · Jul 21, 2023 · Jul 21, 2023 · Jul 21, 2023
diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml
@@ -27,7 +27,7 @@ jobs:
     with:
       additional-r-cmd-check-params: --as-cran
   coverage:
-    name: Coverage 📔 
+    name: Coverage 📔
     uses: insightsengineering/r.pkg.template/.github/workflows/test-coverage.yaml@main
     secrets:
       REPO_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
     uses: insightsengineering/r.pkg.template/.github/workflows/style.yaml@main
     with:
       auto-update: true
-  grammar:
-    if: github.event_name == 'pull_request'
-    name: Grammar Check 🔤
-    uses: insightsengineering/r.pkg.template/.github/workflows/grammar.yaml@main
+  # grammar:
+  #   if: github.event_name == 'pull_request'
+  #   name: Grammar Check 🔤
+  #   uses: insightsengineering/r.pkg.template/.github/workflows/grammar.yaml@main
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -15,6 +15,7 @@ Imports:
     graphics,
     stats,
     survival,
+    DescTools,
     MASS
 Suggests:
     knitr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -13,6 +13,7 @@ export(plot_weights)
 export(process_agd)
 export(resid_plot)
 export(survfit_makeup)
+import(DescTools)
 import(stats)
 importFrom(graphics,abline)
 importFrom(graphics,axis)

diff --git a/R/matching.R b/R/matching.R
@@ -54,16 +54,16 @@

  # prepare data for optimization
  if (is.null(centered_colnames)) centered_colnames <- seq_len(ncol(data))
  EM <- data[, centered_colnames, drop = FALSE]
  ind <- apply(EM, 1, function(xx) any(is.na(xx)))
  nr_missing <- sum(ind)
  EM <- as.matrix(EM[!ind, , drop = FALSE])

  # objective and gradient functions
  objfn <- function(alpha, X) {
    sum(exp(X %*% alpha))
  }
  gradfn <- function(alpha, X) {
    colSums(sweep(X, 1, exp(X %*% alpha), "*"))
  }

@@ -111,13 +111,15 @@
 #' less weight in the re-weighted population than the original data.
 #'
 #' @param wt a numeric vector of individual MAIC weights (derived using \code{\link{estimate_weights}})
+#' @param bin_col a string, color for the bins of histogram
+#' @param vline_col a string, color for the vertical line in the histogram
 #' @param main_title a character string, main title of the plot
 #'
 #' @return a plot of unscaled or scaled weights
 #' @importFrom graphics hist
 #' @export
 
-plot_weights <- function(wt, main_title = "Unscaled Individual Weights") {
+plot_weights <- function(wt, bin_col = "#6ECEB2", vline_col = "#688CE8", main_title = "Unscaled Individual Weights") {
   # calculate sample size and exclude NA from wt
   nr_na <- sum(is.na(wt))
   n <- length(wt) - nr_na
@@ -141,10 +143,10 @@
   }
 
   # plot
-  par(mfrow = c(1, 1), family = "HersheySerif", mgp = c(2.3, 0.5, 0), cex.axis = 0.9, cex.lab = 0.95, bty = "n")
-  hist(wt, border = "white", col = "#6ECEB2", main = main_title, breaks = 20, yaxt = "n", xlab = "")
+  par(mgp = c(2.3, 0.5, 0), cex.axis = 0.9, cex.lab = 0.95, bty = "n")
+  hist(wt, border = "white", col = bin_col, main = main_title, breaks = 20, yaxt = "n", xlab = "")
   axis(2, las = 1)
-  abline(v = median(wt), lty = 2, col = "#688CE8", lwd = 2)
+  abline(v = median(wt), lty = 2, col = vline_col, lwd = 2)
   legend("topright", bty = "n", lty = plot_lty, cex = 0.8, legend = plot_legend)
 }
 
@@ -154,35 +156,93 @@
 #' before and after adjustment.
 #'
 #' @param optimized object returned after calculating weights using \code{\link{estimate_weights}}
-#' @param match_cov covariates that should be checked to see if the IPD weighted average matches the aggregate data
-#' average. This could be same set of variables that were used to match or it can include variables that were not
-#' included to match (i.e. stratification variables)
-#' @param digits number of digits for rounding summary table
+#' @param processed_agd a data frame, object returned after using \code{\link{process_agd}} or
+#' aggregated data following the same naming convention
+#' @param mean_digits number of digits for rounding mean columns in the output
+#' @param prop_digits number of digits for rounding proportion columns in the output
+#' @param sd_digits number of digits for rounding mean columns in the output
+
+#'
+#' @import DescTools
 #'
 #' @return data.frame of weighted and unweighted covariate averages of the IPD
 #' @export
 
-
-
-check_weights <- function(optimized, match_cov, digits = 2) {
-  if (missing(match_cov)) stop("match_cov is missing. Covariates to check must be defined.")
+check_weights <- function(optimized, processed_agd, mean_digits = 2, prop_digits = 2, sd_digits = 3) {
   ipd_with_weights <- optimized$data
-
-  arm_names <- c("Unweighted IPD", "Weighted IPD")
-  ess <- c(nrow(ipd_with_weights), optimized$ess)
-
-  unweighted_cov <- sapply(ipd_with_weights[, match_cov, drop = FALSE], mean)
-
-  weighted_cov <- sapply(
-    ipd_with_weights[, match_cov, drop = FALSE],
-    weighted.mean,
-    w = ipd_with_weights$weights
+  match_cov <- optimized$centered_colnames
+
+  # if algorithm is correct, all centered columns should have a weighted summation to a very small number around zero
+  num_check <- ipd_with_weights$weights %*% as.matrix(ipd_with_weights[, match_cov, drop = FALSE])
+  num_check <- round(num_check, 4)
+
+  # for reporting
+  outdata <- data.frame(
+    covariate = gsub("_CENTERED$", "", match_cov),
+    match_stat = NA,
+    internal_trial = NA,
+    internal_trial_after_weighted = NA,
+    external_trial = NA,
+    sum_centered_IPD_with_weights = as.vector(num_check)
   )
-
-  cov_combined <- rbind(unweighted_cov, weighted_cov)
-
-  baseline_summary <- cbind(ess, cov_combined)
-  rownames(baseline_summary) <- arm_names
-
-  round(baseline_summary, digits = digits)
+  attr(outdata, "footer") <- list()
+  ind_mean <- lapply(outdata$covariate, grep, x = names(processed_agd), value = TRUE) # find item that was matched by mean
+  ind_mean <- sapply(ind_mean, function(ii) any(grepl("_MEAN$", ii)))
+  outdata$match_stat <- ifelse(grepl("_MEDIAN$", outdata$covariate), "Median",
+    ifelse(grepl("_SQUARED$", outdata$covariate), "SD",
+      ifelse(ind_mean, "Mean", "Prop")
+    )
+  )
+  outdata$covariate <- gsub("_MEDIAN|_SQUARED", "", outdata$covariate)
+  # fill in corresponding agd data
+  outdata$external_trial <- unlist(processed_agd[paste(outdata$covariate, toupper(outdata$match_stat), sep = "_")])
+  outdata$external_trial[outdata$match_stat == "Mean"] <- round(outdata$external_trial[outdata$match_stat == "Mean"], mean_digits)
+  outdata$external_trial[outdata$match_stat == "Prop"] <- round(outdata$external_trial[outdata$match_stat == "Prop"], prop_digits)
+  outdata$external_trial[outdata$match_stat == "SD"] <- round(outdata$external_trial[outdata$match_stat == "SD"], sd_digits)
+  # fill in stat from unweighted and weighted IPD
+  for (ii in 1:nrow(outdata)) {
+    covname <- outdata$covariate[ii]
+    if (outdata$match_stat[ii] %in% c("Mean", "Prop")) {
+      outdata$internal_trial[ii] <- mean(ipd_with_weights[[covname]], na.rm = TRUE)
+      outdata$internal_trial_after_weighted[ii] <- weighted.mean(ipd_with_weights[[covname]], w = ipd_with_weights$weights, na.rm = TRUE)
+    } else if (outdata$match_stat[ii] == "Median") {
+      outdata$internal_trial[ii] <- quantile(ipd_with_weights[[covname]],
+        probs = 0.5,
+        na.rm = TRUE,
+        type = 2,
+        names = FALSE
+      ) # SAS default
+      outdata$internal_trial_after_weighted[ii] <- DescTools::Quantile(ipd_with_weights[[covname]],
+        weights = ipd_with_weights$weights,
+        probs = 0.5,
+        na.rm = TRUE,
+        type = 5,
+        names = FALSE
+      )
+      # no IPD equals to reported AgD median
+      msg_ind <- !any(ipd_with_weights[[covname]] == outdata$external_trial[ii], na.rm = TRUE)
+      if (msg_ind) {
+        msg_txt <- paste0(
+          "For covariate ", covname, ", it was matched to AgD median, but there is no IPD identical to AgD median,",
+          "hence median after weighted will not equal to AgD median exactly."
+        )
+        attr(outdata, "footer") <- c(attr(outdata, "footer"), msg_txt)
+      }
+    } else if (outdata$match_stat[ii] == "SD") {
+      outdata$internal_trial[ii] <- sd(ipd_with_weights[[covname]], na.rm = TRUE)
+      wm_squared <- weighted.mean(ipd_with_weights[[covname]]^2, w = ipd_with_weights$weights, na.rm = TRUE)
+      ms_agd <- processed_agd[[paste0(outdata$covariate[ii], "_MEAN")]]^2
+      outdata$internal_trial_after_weighted[ii] <- sqrt(wm_squared - ms_agd)
+    }
+  }
+  # formating
+
+  outdata$internal_trial[outdata$match_stat == "Mean"] <- round(outdata$internal_trial[outdata$match_stat == "Mean"], mean_digits)
+  outdata$internal_trial[outdata$match_stat == "Prop"] <- round(outdata$internal_trial[outdata$match_stat == "Prop"], prop_digits)
+  outdata$internal_trial[outdata$match_stat == "SD"] <- round(outdata$internal_trial[outdata$match_stat == "SD"], sd_digits)
+  outdata$internal_trial_after_weighted[outdata$match_stat == "Mean"] <- round(outdata$internal_trial_after_weighted[outdata$match_stat == "Mean"], mean_digits)
+  outdata$internal_trial_after_weighted[outdata$match_stat == "Prop"] <- round(outdata$internal_trial_after_weighted[outdata$match_stat == "Prop"], prop_digits)
+  outdata$internal_trial_after_weighted[outdata$match_stat == "SD"] <- round(outdata$internal_trial_after_weighted[outdata$match_stat == "SD"], sd_digits)
+  # output
+  outdata
 }
diff --git a/R/process_data.R b/R/process_data.R
@@ -8,6 +8,8 @@
 #' Data is required to have three columns: STUDY, ARM, and N.
 #' Column names that do not have legal suffixes (MEAN, MEDIAN, SD, COUNT, or PROP) are dropped.
 #' If a variable is a count variable, it is converted to proportions by dividing the sample size (N).
+#' Note, when the count is specified, proportion is always calculated based on the count, that is,
+#' specified proportion will be ignored if applicable.
 #' If the aggregated data comes from multiple sources (i.e. different analysis population) and
 #' sample size differs for each variable, one option is to specify proportion directly instead of count by using suffix
 #' _PROP.
@@ -16,6 +18,22 @@
 #' by legal suffixes (i.e. MEAN, MEDIAN, SD, COUNT, or PROP).
 #'
 #' @examples
+#' # example
+#' target_pop <- read.csv(system.file("extdata", "aggregate_data_example_1.csv",
+#'   package = "maicplus", mustWork = TRUE
+#' ))
+#' target_pop2 <- read.csv(system.file("extdata", "aggregate_data_example_2.csv",
+#'   package = "maicplus", mustWork = TRUE
+#' ))
+#' target_pop3 <- read.csv(system.file("extdata", "aggregate_data_example_3.csv",
+#'   package = "maicplus", mustWork = TRUE
+#' ))
+#'
+#' target_pop <- process_agd(target_pop)
+#' target_pop2 <- process_agd(target_pop2)
+#' target_pop3 <- process_agd(target_pop3)
+#'
+#' # another example
 #' target_pop <- data.frame(
 #'   STUDY = "Study_XXXX",
 #'   ARM = "Total",
@@ -37,7 +55,7 @@ process_agd <- function(raw_agd) {
   # make all column names to be capital letters to avoid different style
   names(raw_agd) <- toupper(names(raw_agd))
 
-  # define column name patterns
+  # define column name patterns[-]
   must_exist <- c("STUDY", "ARM", "N")
   legal_suffix <- c("MEAN", "MEDIAN", "SD", "COUNT", "PROP")
 
@@ -81,10 +99,19 @@ process_agd <- function(raw_agd) {
   ind <- grepl("_COUNT$", names(use_agd))
   if (any(ind)) {
     for (i in which(ind)) {
-      use_agd[[i]] <- use_agd[[i]] / use_agd$N
+      tmp_prop <- use_agd[[i]] / use_agd$N
+      # in case some count are not specified, but proportion are specified, copy over those proportions
+      # this also means, in case count is specified, proportion is ignored even it is specified
+      prop_name_i <- gsub("_COUNT$", "_PROP", names(use_agd)[i])
+      if (prop_name_i %in% names(use_agd)) {
+        tmp_prop[is.na(tmp_prop)] <- use_agd[is.na(tmp_prop), prop_name_i]
+        names(use_agd)[names(use_agd) == prop_name_i] <- paste0(prop_name_i, "_redundant")
+      }
+      use_agd[[i]] <- tmp_prop
     }
     names(use_agd) <- gsub("_COUNT$", "_PROP", names(use_agd))
   }
+  use_agd <- use_agd[, !grepl("_redundant$", names(use_agd))]
 
   # output
   with(use_agd, use_agd[tolower(ARM) == "total", , drop = FALSE])
@@ -155,13 +182,12 @@ center_ipd <- function(ipd, agd) {
       if (grepl("_MEAN$|_PROP$", names(use_agd)[j])) {
         ipd[[paste0(ipd_param, "_", "CENTERED")]] <- ipd[[ipd_param]] - use_agd[[j]]
       } else if (grepl("_MEDIAN$", names(use_agd)[j])) {
-        ipd[[paste0(ipd_param, "_MED_", "CENTERED")]] <- ipd[[ipd_param]] > use_agd[[j]]
-        ipd[[paste0(ipd_param, "_MED_", "CENTERED")]] <- ipd[[paste0(ipd_param, "_MED_", "CENTERED")]] - 0.5
+        ipd[[paste0(ipd_param, "_MEDIAN_", "CENTERED")]] <- ipd[[ipd_param]] > use_agd[[j]]
+        ipd[[paste0(ipd_param, "_MEDIAN_", "CENTERED")]] <- ipd[[paste0(ipd_param, "_MEDIAN_", "CENTERED")]] - 0.5
       } else if (grepl("_SD$", names(use_agd)[j])) {
-        # this term is denoted as SD, but it is really a squared mean term
-        ipd[[paste0(ipd_param, "_SD_", "CENTERED")]] <- ipd[[ipd_param]]^2
+        ipd[[paste0(ipd_param, "_SQUARED_", "CENTERED")]] <- ipd[[ipd_param]]^2
         tmp_aim <- use_agd[[j]]^2 + (use_agd[[paste0(ipd_param, "_MEAN")]]^2)
-        ipd[[paste0(ipd_param, "_SD_", "CENTERED")]] <- ipd[[paste0(ipd_param, "_SD_", "CENTERED")]] - tmp_aim
+        ipd[[paste0(ipd_param, "_SQUARED_", "CENTERED")]] <- ipd[[paste0(ipd_param, "_SQUARED_", "CENTERED")]] - tmp_aim
       }
     } # end of j
   } # end of i
@@ -198,17 +224,17 @@ complete_agd <- function(use_agd) {
   NN <- use_agd[["N"]][rowId] <- sum(use_agd[["N"]], na.rm = TRUE)
   nn <- use_agd[["N"]][-rowId]
   for (i in grep("_COUNT$", names(use_agd), value = TRUE)) {
-    use_agd[[i]][rowId] <- sum(use_agd[[i]], na.rm = TRUE)
+    use_agd[[i]][rowId] <- sum(use_agd[[i]][-rowId], na.rm = TRUE)
   }
 
   # complete MEAN
   for (i in grep("_MEAN$", names(use_agd), value = TRUE)) {
-    use_agd[[i]][rowId] <- sum(use_agd[[i]] * nn) / NN
+    use_agd[[i]][rowId] <- sum(use_agd[[i]][-rowId] * nn) / NN
   }
 
   # complete SD
   for (i in grep("_SD$", names(use_agd), value = TRUE)) {
-    use_agd[[i]][rowId] <- sqrt(sum(use_agd[[i]]^2 * (nn - 1)) / (NN - 1))
+    use_agd[[i]][rowId] <- sqrt(sum(use_agd[[i]][-rowId]^2 * (nn - 1)) / (NN - 1))
   }
 
   # complete MEDIAN, approximately!!

diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -24,3 +24,11 @@ ADaM
 unanchored
 TTE
 AgD
+reproducibility
+unscaled
+kaplan
+meier
+Remiro
+Azocar
+al
+et