Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update preprocess function and check weights #11

Merged
merged 17 commits into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
with:
additional-r-cmd-check-params: --as-cran
coverage:
name: Coverage 📔
name: Coverage 📔
uses: insightsengineering/r.pkg.template/.github/workflows/test-coverage.yaml@main
secrets:
REPO_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand Down Expand Up @@ -71,7 +71,7 @@ jobs:
uses: insightsengineering/r.pkg.template/.github/workflows/style.yaml@main
with:
auto-update: true
grammar:
if: github.event_name == 'pull_request'
name: Grammar Check 🔤
uses: insightsengineering/r.pkg.template/.github/workflows/grammar.yaml@main
# grammar:
# if: github.event_name == 'pull_request'
# name: Grammar Check 🔤
# uses: insightsengineering/r.pkg.template/.github/workflows/grammar.yaml@main
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Imports:
graphics,
stats,
survival,
DescTools,
MASS
Suggests:
knitr,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export(plot_weights)
export(process_agd)
export(resid_plot)
export(survfit_makeup)
import(DescTools)
import(stats)
importFrom(graphics,abline)
importFrom(graphics,axis)
Expand Down
118 changes: 89 additions & 29 deletions R/matching.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,16 @@

# prepare data for optimization
if (is.null(centered_colnames)) centered_colnames <- seq_len(ncol(data))
EM <- data[, centered_colnames, drop = FALSE]

Check warning on line 57 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=57,col=3,[object_name_linter] Variable and function name style should be snake_case or symbols.
ind <- apply(EM, 1, function(xx) any(is.na(xx)))
nr_missing <- sum(ind)
EM <- as.matrix(EM[!ind, , drop = FALSE])

Check warning on line 60 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=60,col=3,[object_name_linter] Variable and function name style should be snake_case or symbols.

# objective and gradient functions
objfn <- function(alpha, X) {

Check warning on line 63 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=63,col=28,[object_name_linter] Variable and function name style should be snake_case or symbols.
sum(exp(X %*% alpha))
}
gradfn <- function(alpha, X) {

Check warning on line 66 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=66,col=29,[object_name_linter] Variable and function name style should be snake_case or symbols.
colSums(sweep(X, 1, exp(X %*% alpha), "*"))
}

Expand Down Expand Up @@ -111,13 +111,15 @@
#' less weight in the re-weighted population than the original data.
#'
#' @param wt a numeric vector of individual MAIC weights (derived using \code{\link{estimate_weights}})
#' @param bin_col a string, color for the bins of histogram
#' @param vline_col a string, color for the vertical line in the histogram
#' @param main_title a character string, main title of the plot
#'
#' @return a plot of unscaled or scaled weights
#' @importFrom graphics hist
#' @export

plot_weights <- function(wt, main_title = "Unscaled Individual Weights") {
plot_weights <- function(wt, bin_col = "#6ECEB2", vline_col = "#688CE8", main_title = "Unscaled Individual Weights") {
hoppanda marked this conversation as resolved.
Show resolved Hide resolved
# calculate sample size and exclude NA from wt
nr_na <- sum(is.na(wt))
n <- length(wt) - nr_na
Expand All @@ -141,10 +143,10 @@
}

# plot
par(mfrow = c(1, 1), family = "HersheySerif", mgp = c(2.3, 0.5, 0), cex.axis = 0.9, cex.lab = 0.95, bty = "n")
hist(wt, border = "white", col = "#6ECEB2", main = main_title, breaks = 20, yaxt = "n", xlab = "")
par(mgp = c(2.3, 0.5, 0), cex.axis = 0.9, cex.lab = 0.95, bty = "n")
hist(wt, border = "white", col = bin_col, main = main_title, breaks = 20, yaxt = "n", xlab = "")
axis(2, las = 1)
abline(v = median(wt), lty = 2, col = "#688CE8", lwd = 2)
abline(v = median(wt), lty = 2, col = vline_col, lwd = 2)
legend("topright", bty = "n", lty = plot_lty, cex = 0.8, legend = plot_legend)
}

Expand All @@ -154,35 +156,93 @@
#' before and after adjustment.
#'
#' @param optimized object returned after calculating weights using \code{\link{estimate_weights}}
#' @param match_cov covariates that should be checked to see if the IPD weighted average matches the aggregate data
#' average. This could be same set of variables that were used to match or it can include variables that were not
#' included to match (i.e. stratification variables)
#' @param digits number of digits for rounding summary table
#' @param processed_agd a data frame, object returned after using \code{\link{process_agd}} or
#' aggregated data following the same naming convention
#' @param mean_digits number of digits for rounding mean columns in the output
#' @param prop_digits number of digits for rounding proportion columns in the output
#' @param sd_digits number of digits for rounding mean columns in the output

#'
#' @import DescTools
#'
#' @return data.frame of weighted and unweighted covariate averages of the IPD
#' @export



check_weights <- function(optimized, match_cov, digits = 2) {
if (missing(match_cov)) stop("match_cov is missing. Covariates to check must be defined.")
check_weights <- function(optimized, processed_agd, mean_digits = 2, prop_digits = 2, sd_digits = 3) {
ipd_with_weights <- optimized$data

arm_names <- c("Unweighted IPD", "Weighted IPD")
ess <- c(nrow(ipd_with_weights), optimized$ess)

unweighted_cov <- sapply(ipd_with_weights[, match_cov, drop = FALSE], mean)

weighted_cov <- sapply(
ipd_with_weights[, match_cov, drop = FALSE],
weighted.mean,
w = ipd_with_weights$weights
match_cov <- optimized$centered_colnames

# if algorithm is correct, all centered columns should have a weighted summation to a very small number around zero
num_check <- ipd_with_weights$weights %*% as.matrix(ipd_with_weights[, match_cov, drop = FALSE])
num_check <- round(num_check, 4)

# for reporting
outdata <- data.frame(
covariate = gsub("_CENTERED$", "", match_cov),
match_stat = NA,
internal_trial = NA,
internal_trial_after_weighted = NA,
external_trial = NA,
sum_centered_IPD_with_weights = as.vector(num_check)
hoppanda marked this conversation as resolved.
Show resolved Hide resolved
)

cov_combined <- rbind(unweighted_cov, weighted_cov)

baseline_summary <- cbind(ess, cov_combined)
rownames(baseline_summary) <- arm_names

round(baseline_summary, digits = digits)
attr(outdata, "footer") <- list()
ind_mean <- lapply(outdata$covariate, grep, x = names(processed_agd), value = TRUE) # find item that was matched by mean

Check warning on line 189 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=189,col=121,[line_length_linter] Lines should not be more than 120 characters.
ind_mean <- sapply(ind_mean, function(ii) any(grepl("_MEAN$", ii)))
outdata$match_stat <- ifelse(grepl("_MEDIAN$", outdata$covariate), "Median",
ifelse(grepl("_SQUARED$", outdata$covariate), "SD",
ifelse(ind_mean, "Mean", "Prop")
)
)
outdata$covariate <- gsub("_MEDIAN|_SQUARED", "", outdata$covariate)
# fill in corresponding agd data
outdata$external_trial <- unlist(processed_agd[paste(outdata$covariate, toupper(outdata$match_stat), sep = "_")])
outdata$external_trial[outdata$match_stat == "Mean"] <- round(outdata$external_trial[outdata$match_stat == "Mean"], mean_digits)

Check warning on line 199 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=199,col=121,[line_length_linter] Lines should not be more than 120 characters.
outdata$external_trial[outdata$match_stat == "Prop"] <- round(outdata$external_trial[outdata$match_stat == "Prop"], prop_digits)

Check warning on line 200 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=200,col=121,[line_length_linter] Lines should not be more than 120 characters.
outdata$external_trial[outdata$match_stat == "SD"] <- round(outdata$external_trial[outdata$match_stat == "SD"], sd_digits)

Check warning on line 201 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=201,col=121,[line_length_linter] Lines should not be more than 120 characters.
# fill in stat from unweighted and weighted IPD
for (ii in 1:nrow(outdata)) {

Check warning on line 203 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=203,col=14,[seq_linter] 1:nrow(...) is likely to be wrong in the empty edge case. Use seq_len(nrow(...)) instead.
covname <- outdata$covariate[ii]
if (outdata$match_stat[ii] %in% c("Mean", "Prop")) {
outdata$internal_trial[ii] <- mean(ipd_with_weights[[covname]], na.rm = TRUE)
outdata$internal_trial_after_weighted[ii] <- weighted.mean(ipd_with_weights[[covname]], w = ipd_with_weights$weights, na.rm = TRUE)

Check warning on line 207 in R/matching.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / SuperLinter 🦸‍♂️

file=/github/workspace/R/matching.R,line=207,col=121,[line_length_linter] Lines should not be more than 120 characters.
} else if (outdata$match_stat[ii] == "Median") {
outdata$internal_trial[ii] <- quantile(ipd_with_weights[[covname]],
probs = 0.5,
na.rm = TRUE,
type = 2,
names = FALSE
) # SAS default
outdata$internal_trial_after_weighted[ii] <- DescTools::Quantile(ipd_with_weights[[covname]],
weights = ipd_with_weights$weights,
probs = 0.5,
na.rm = TRUE,
type = 5,
names = FALSE
)
# no IPD equals to reported AgD median
msg_ind <- !any(ipd_with_weights[[covname]] == outdata$external_trial[ii], na.rm = TRUE)
if (msg_ind) {
msg_txt <- paste0(
"For covariate ", covname, ", it was matched to AgD median, but there is no IPD identical to AgD median,",
"hence median after weighted will not equal to AgD median exactly."
)
attr(outdata, "footer") <- c(attr(outdata, "footer"), msg_txt)
}
} else if (outdata$match_stat[ii] == "SD") {
outdata$internal_trial[ii] <- sd(ipd_with_weights[[covname]], na.rm = TRUE)
wm_squared <- weighted.mean(ipd_with_weights[[covname]]^2, w = ipd_with_weights$weights, na.rm = TRUE)
ms_agd <- processed_agd[[paste0(outdata$covariate[ii], "_MEAN")]]^2
outdata$internal_trial_after_weighted[ii] <- sqrt(wm_squared - ms_agd)
}
}
# formating

outdata$internal_trial[outdata$match_stat == "Mean"] <- round(outdata$internal_trial[outdata$match_stat == "Mean"], mean_digits)
outdata$internal_trial[outdata$match_stat == "Prop"] <- round(outdata$internal_trial[outdata$match_stat == "Prop"], prop_digits)
outdata$internal_trial[outdata$match_stat == "SD"] <- round(outdata$internal_trial[outdata$match_stat == "SD"], sd_digits)
outdata$internal_trial_after_weighted[outdata$match_stat == "Mean"] <- round(outdata$internal_trial_after_weighted[outdata$match_stat == "Mean"], mean_digits)
outdata$internal_trial_after_weighted[outdata$match_stat == "Prop"] <- round(outdata$internal_trial_after_weighted[outdata$match_stat == "Prop"], prop_digits)
outdata$internal_trial_after_weighted[outdata$match_stat == "SD"] <- round(outdata$internal_trial_after_weighted[outdata$match_stat == "SD"], sd_digits)
# output
outdata
}
46 changes: 36 additions & 10 deletions R/process_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#' Data is required to have three columns: STUDY, ARM, and N.
#' Column names that do not have legal suffixes (MEAN, MEDIAN, SD, COUNT, or PROP) are dropped.
#' If a variable is a count variable, it is converted to proportions by dividing the sample size (N).
#' Note, when the count is specified, proportion is always calculated based on the count, that is,
#' specified proportion will be ignored if applicable.
#' If the aggregated data comes from multiple sources (i.e. different analysis population) and
#' sample size differs for each variable, one option is to specify proportion directly instead of count by using suffix
#' _PROP.
Expand All @@ -16,6 +18,22 @@
#' by legal suffixes (i.e. MEAN, MEDIAN, SD, COUNT, or PROP).
#'
#' @examples
#' # example
#' target_pop <- read.csv(system.file("extdata", "aggregate_data_example_1.csv",
#' package = "maicplus", mustWork = TRUE
#' ))
#' target_pop2 <- read.csv(system.file("extdata", "aggregate_data_example_2.csv",
#' package = "maicplus", mustWork = TRUE
#' ))
#' target_pop3 <- read.csv(system.file("extdata", "aggregate_data_example_3.csv",
#' package = "maicplus", mustWork = TRUE
#' ))
#'
#' target_pop <- process_agd(target_pop)
#' target_pop2 <- process_agd(target_pop2)
#' target_pop3 <- process_agd(target_pop3)
#'
#' # another example
#' target_pop <- data.frame(
#' STUDY = "Study_XXXX",
#' ARM = "Total",
Expand All @@ -37,7 +55,7 @@ process_agd <- function(raw_agd) {
# make all column names to be capital letters to avoid different style
names(raw_agd) <- toupper(names(raw_agd))

# define column name patterns
# define column name patterns[-]
must_exist <- c("STUDY", "ARM", "N")
legal_suffix <- c("MEAN", "MEDIAN", "SD", "COUNT", "PROP")

Expand Down Expand Up @@ -81,10 +99,19 @@ process_agd <- function(raw_agd) {
ind <- grepl("_COUNT$", names(use_agd))
if (any(ind)) {
for (i in which(ind)) {
use_agd[[i]] <- use_agd[[i]] / use_agd$N
tmp_prop <- use_agd[[i]] / use_agd$N
# in case some count are not specified, but proportion are specified, copy over those proportions
# this also means, in case count is specified, proportion is ignored even it is specified
prop_name_i <- gsub("_COUNT$", "_PROP", names(use_agd)[i])
if (prop_name_i %in% names(use_agd)) {
tmp_prop[is.na(tmp_prop)] <- use_agd[is.na(tmp_prop), prop_name_i]
names(use_agd)[names(use_agd) == prop_name_i] <- paste0(prop_name_i, "_redundant")
}
use_agd[[i]] <- tmp_prop
}
names(use_agd) <- gsub("_COUNT$", "_PROP", names(use_agd))
}
use_agd <- use_agd[, !grepl("_redundant$", names(use_agd))]

# output
with(use_agd, use_agd[tolower(ARM) == "total", , drop = FALSE])
Expand Down Expand Up @@ -155,13 +182,12 @@ center_ipd <- function(ipd, agd) {
if (grepl("_MEAN$|_PROP$", names(use_agd)[j])) {
ipd[[paste0(ipd_param, "_", "CENTERED")]] <- ipd[[ipd_param]] - use_agd[[j]]
} else if (grepl("_MEDIAN$", names(use_agd)[j])) {
ipd[[paste0(ipd_param, "_MED_", "CENTERED")]] <- ipd[[ipd_param]] > use_agd[[j]]
ipd[[paste0(ipd_param, "_MED_", "CENTERED")]] <- ipd[[paste0(ipd_param, "_MED_", "CENTERED")]] - 0.5
ipd[[paste0(ipd_param, "_MEDIAN_", "CENTERED")]] <- ipd[[ipd_param]] > use_agd[[j]]
ipd[[paste0(ipd_param, "_MEDIAN_", "CENTERED")]] <- ipd[[paste0(ipd_param, "_MEDIAN_", "CENTERED")]] - 0.5
} else if (grepl("_SD$", names(use_agd)[j])) {
# this term is denoted as SD, but it is really a squared mean term
ipd[[paste0(ipd_param, "_SD_", "CENTERED")]] <- ipd[[ipd_param]]^2
ipd[[paste0(ipd_param, "_SQUARED_", "CENTERED")]] <- ipd[[ipd_param]]^2
tmp_aim <- use_agd[[j]]^2 + (use_agd[[paste0(ipd_param, "_MEAN")]]^2)
ipd[[paste0(ipd_param, "_SD_", "CENTERED")]] <- ipd[[paste0(ipd_param, "_SD_", "CENTERED")]] - tmp_aim
ipd[[paste0(ipd_param, "_SQUARED_", "CENTERED")]] <- ipd[[paste0(ipd_param, "_SQUARED_", "CENTERED")]] - tmp_aim
}
} # end of j
} # end of i
Expand Down Expand Up @@ -198,17 +224,17 @@ complete_agd <- function(use_agd) {
NN <- use_agd[["N"]][rowId] <- sum(use_agd[["N"]], na.rm = TRUE)
nn <- use_agd[["N"]][-rowId]
for (i in grep("_COUNT$", names(use_agd), value = TRUE)) {
use_agd[[i]][rowId] <- sum(use_agd[[i]], na.rm = TRUE)
use_agd[[i]][rowId] <- sum(use_agd[[i]][-rowId], na.rm = TRUE)
}

# complete MEAN
for (i in grep("_MEAN$", names(use_agd), value = TRUE)) {
use_agd[[i]][rowId] <- sum(use_agd[[i]] * nn) / NN
use_agd[[i]][rowId] <- sum(use_agd[[i]][-rowId] * nn) / NN
}

# complete SD
for (i in grep("_SD$", names(use_agd), value = TRUE)) {
use_agd[[i]][rowId] <- sqrt(sum(use_agd[[i]]^2 * (nn - 1)) / (NN - 1))
use_agd[[i]][rowId] <- sqrt(sum(use_agd[[i]][-rowId]^2 * (nn - 1)) / (NN - 1))
}

# complete MEDIAN, approximately!!
Expand Down
8 changes: 8 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,11 @@ ADaM
unanchored
TTE
AgD
reproducibility
unscaled
kaplan
meier
Remiro
Azocar
al
et
Loading
Loading