From eb248e0b3309062b69d2c4846b03de71c098bb02 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 1 Feb 2024 19:44:17 +0000 Subject: [PATCH 01/24] Initial addition of data limiting function --- NAMESPACE | 1 + R/count.R | 20 +++++++++++++++++--- R/count_bindings.R | 24 ++++++++++++++++++++++++ man/set_limit_data_by.Rd | 18 ++++++++++++++++++ 4 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 man/set_limit_data_by.Rd diff --git a/NAMESPACE b/NAMESPACE index d2b779eb..4e901e36 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -101,6 +101,7 @@ export(set_distinct_by) export(set_format_strings) export(set_header_n) export(set_indentation) +export(set_limit_data_by) export(set_missing_count) export(set_nest_count) export(set_numeric_threshold) diff --git a/R/count.R b/R/count.R index 030db51d..ae5ff2de 100644 --- a/R/count.R +++ b/R/count.R @@ -121,7 +121,7 @@ process_summaries.count_layer <- function(x, ...) { #' If include_total_row is true a row will be added with a total row labeled #' with total_row_label. #' -#' Complete is used to complete the combinaions of by, treat_var, and target_var +#' Complete is used to complete the combinations of by, treat_var, and target_var #' #' @noRd process_single_count_target <- function(x) { @@ -256,11 +256,25 @@ process_count_n <- function(x) { names(missing_count_list))) } - summary_stat <- summary_stat %>% + complete_levels <- summary_stat %>% # complete all combinations of factors to include combinations that don't exist. # add 0 for combinations that don't exist complete(!!treat_var, !!!by, !!!target_var, !!!cols, - fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0)) %>% + fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0)) + + # Apply data limits specified by setter + if (exists("limit_data_by")) { + # Find the combinations actually in the data + groups_in_data <- summary_stat %>% + distinct(!!!limit_data_by) + + # Join back to limit the completed levels based on the preferred + # data driven ones + complete_levels <- groups_in_data %>% + left_join(complete_levels, by = map_chr(limit_data_by, as_name)) + } + + summary_stat <- complete_levels %>% # Change the treat_var and first target_var to characters to resolve any # issues if there are total rows and the original column is numeric mutate(!!treat_var := as.character(!!treat_var)) %>% diff --git a/R/count_bindings.R b/R/count_bindings.R index 197c6757..32d08e3e 100644 --- a/R/count_bindings.R +++ b/R/count_bindings.R @@ -711,3 +711,27 @@ set_numeric_threshold <- function(e, numeric_cutoff, stat, column = NULL) { e } + +#' Set variables to limit to data values only rather than fully completing all +#' possible levels +#' +#' @param e A tplyr_layer +#' @param ... Subset of variables within by or target variables +#' +#' @return +#' @export +set_limit_data_by <- function(e, ...) { + dots <- enquos(...) + dots_chr <- map_chr(dots, as_name) + + # Pull these variables to make sure the denoms used make sense + by_ <- map_chr(env_get(e, "by"), as_name) + tv_ <- map_chr(env_get(e, "target_var"), as_name) + + if (!all(dots_chr %in% c(by_, tv_))) { + stop("Complete by variables must be included in by variables set on layer", call.=FALSE) + } + + env_bind(e, limit_data_by = dots) + e +} diff --git a/man/set_limit_data_by.Rd b/man/set_limit_data_by.Rd new file mode 100644 index 00000000..76fb14e0 --- /dev/null +++ b/man/set_limit_data_by.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/count_bindings.R +\name{set_limit_data_by} +\alias{set_limit_data_by} +\title{Set variables to limit to data values only rather than fully completing all +possible levels} +\usage{ +set_limit_data_by(e, ...) +} +\arguments{ +\item{e}{A tplyr_layer} + +\item{...}{Subset of variables within by or target variables} +} +\description{ +Set variables to limit to data values only rather than fully completing all +possible levels +} From 9e15574265a3f7576cbe22282e924b444c88db74 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Fri, 2 Feb 2024 21:51:10 +0000 Subject: [PATCH 02/24] Separate to genericized function, add test file --- NAMESPACE | 2 + R/count.R | 19 +----- R/count_bindings.R | 24 -------- R/desc.R | 24 ++++++-- R/set_limit_data_by.R | 82 +++++++++++++++++++++++++ R/zzz.R | 1 + man/set_limit_data_by.Rd | 2 +- tests/testthat/test-set_limit_data_by.R | 64 +++++++++++++++++++ 8 files changed, 173 insertions(+), 45 deletions(-) create mode 100644 R/set_limit_data_by.R create mode 100644 tests/testthat/test-set_limit_data_by.R diff --git a/NAMESPACE b/NAMESPACE index 4e901e36..fe7924b1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,8 @@ S3method(set_denoms_by,shift_layer) S3method(set_format_strings,count_layer) S3method(set_format_strings,desc_layer) S3method(set_format_strings,shift_layer) +S3method(set_limit_data_by,count_layer) +S3method(set_limit_data_by,desc_layer) S3method(set_where,tplyr_layer) S3method(set_where,tplyr_table) S3method(str,f_str) diff --git a/R/count.R b/R/count.R index ae5ff2de..8a876513 100644 --- a/R/count.R +++ b/R/count.R @@ -257,22 +257,9 @@ process_count_n <- function(x) { } complete_levels <- summary_stat %>% - # complete all combinations of factors to include combinations that don't exist. - # add 0 for combinations that don't exist - complete(!!treat_var, !!!by, !!!target_var, !!!cols, - fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0)) - - # Apply data limits specified by setter - if (exists("limit_data_by")) { - # Find the combinations actually in the data - groups_in_data <- summary_stat %>% - distinct(!!!limit_data_by) - - # Join back to limit the completed levels based on the preferred - # data driven ones - complete_levels <- groups_in_data %>% - left_join(complete_levels, by = map_chr(limit_data_by, as_name)) - } + complete_and_limit(treat_var, by, target_var, cols, + limit = exists("limit_data_by"), + .fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0)) summary_stat <- complete_levels %>% # Change the treat_var and first target_var to characters to resolve any diff --git a/R/count_bindings.R b/R/count_bindings.R index 32d08e3e..197c6757 100644 --- a/R/count_bindings.R +++ b/R/count_bindings.R @@ -711,27 +711,3 @@ set_numeric_threshold <- function(e, numeric_cutoff, stat, column = NULL) { e } - -#' Set variables to limit to data values only rather than fully completing all -#' possible levels -#' -#' @param e A tplyr_layer -#' @param ... Subset of variables within by or target variables -#' -#' @return -#' @export -set_limit_data_by <- function(e, ...) { - dots <- enquos(...) - dots_chr <- map_chr(dots, as_name) - - # Pull these variables to make sure the denoms used make sense - by_ <- map_chr(env_get(e, "by"), as_name) - tv_ <- map_chr(env_get(e, "target_var"), as_name) - - if (!all(dots_chr %in% c(by_, tv_))) { - stop("Complete by variables must be included in by variables set on layer", call.=FALSE) - } - - env_bind(e, limit_data_by = dots) - e -} diff --git a/R/desc.R b/R/desc.R index 00922c6a..829da546 100644 --- a/R/desc.R +++ b/R/desc.R @@ -49,16 +49,32 @@ process_summaries.desc_layer <- function(x, ...) { summaries <- get_summaries()[match_exact(summary_vars)] # Create the numeric summary data - num_sums_raw[[i]] <- built_target %>% + cmplt1 <- built_target %>% # Rename the current variable to make each iteration use a generic name rename(.var = !!cur_var) %>% # Group by treatment, provided by variable, and provided column variables group_by(!!treat_var, !!!by, !!!cols) %>% # Execute the summaries summarize(!!!summaries) %>% - ungroup() %>% - # Fill in any missing treat/col combinations - complete(!!treat_var, !!!by, !!!cols) + ungroup() + + num_sums_raw[[i]] <- complete_and_limit(cmplt1, treat_var, by, cols, limit_data_by=limit_data_by) + # num_sums_raw[[i]] <- cmplt1 %>% + # # complete all combinations of factors to include combinations that don't exist. + # # add 0 for combinations that don't exist + # complete(!!treat_var, !!!by, !!!cols) + # + # # Apply data limits specified by setter + # if (exists("limit_data_by")) { + # # Find the combinations actually in the data + # groups_in_data <- cmplt1 %>% + # distinct(!!!limit_data_by) + # + # # Join back to limit the completed levels based on the preferred + # # data driven ones + # num_sums_raw[[i]] <- groups_in_data %>% + # left_join(num_sums_raw[[i]], by = map_chr(limit_data_by, as_name)) + # } # Create the transposed summary data to prepare for formatting trans_sums[[i]] <- num_sums_raw[[i]] %>% diff --git a/R/set_limit_data_by.R b/R/set_limit_data_by.R new file mode 100644 index 00000000..ac08047e --- /dev/null +++ b/R/set_limit_data_by.R @@ -0,0 +1,82 @@ +#' Set variables to limit to data values only rather than fully completing all +#' possible levels +#' +#' @param e A tplyr_layer +#' @param ... Subset of variables within by or target variables +#' +#' @return +#' @export +set_limit_data_by <- function(e, ...) { + UseMethod("set_limit_data_by") +} + +#' @export +#' @noRd +set_limit_data_by.count_layer <- function(e, ...) { + dots <- enquos(...) + dots_chr <- map_chr(dots, as_name) + + # Pull these variables to make sure the denoms used make sense + by_ <- map_chr(env_get(e, "by"), as_name) + tv_ <- map_chr(env_get(e, "target_var"), as_name) + + if (!all(dots_chr %in% c(by_, tv_))) { + stop("Limit data by variables must be included in by variables or target variable set on layer", call.=FALSE) + } + + env_bind(e, limit_data_by = dots) + e +} + +#' @export +#' @noRd +set_limit_data_by.desc_layer <- function(e, ...) { + dots <- enquos(...) + dots_chr <- map_chr(dots, as_name) + + # Pull these variables to make sure the denoms used make sense + by_ <- map_chr(env_get(e, "by"), as_name) + + if (!all(dots_chr %in% by_)) { + stop("Limit data by variables must be included in by variables set on layer", call.=FALSE) + } + + env_bind(e, limit_data_by = dots) + e +} + +#' General function used to process the steps to pad levels in data, or limit to +#' combinations available in the data itself +#' +#' @param dat +#' @param treat_var +#' @param by +#' @param cols +#' @param target_var +#' @param limit +#' @param .fill +#' TODO: Figure out best way to pass the data limiting into this function, because it doesn't exist unless distinctly set. +complete_and_limit <- function(dat, treat_var, by, cols, target_var=quos(), limit_data_by, .fill=list()) { + + complete_levels <- dat %>% + # complete all combinations of factors to include combinations that don't exist. + # add 0 for combinations that don't exist + complete(!!treat_var, !!!by, !!!unname(target_var), !!!cols, + fill = .fill) + + # Apply data limits specified by setter + if (!is.null(limit_data_by)) { + # Find the combinations actually in the data + groups_in_data <- dat %>% + distinct(!!!limit_data_by) + + # Join back to limit the completed levels based on the preferred + # data driven ones + limited_data <- groups_in_data %>% + left_join(complete_levels, by = map_chr(limit_data_by, as_name)) + + return(limited_data) + } + + complete_levels +} diff --git a/R/zzz.R b/R/zzz.R index 70df2747..cf00b115 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -307,4 +307,5 @@ og_row <- NULL desc <- NULL id <- NULL stub_sort <- NULL +limit_data_by <- NULL diff --git a/man/set_limit_data_by.Rd b/man/set_limit_data_by.Rd index 76fb14e0..49203e21 100644 --- a/man/set_limit_data_by.Rd +++ b/man/set_limit_data_by.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/count_bindings.R +% Please edit documentation in R/set_limit_data_by.R \name{set_limit_data_by} \alias{set_limit_data_by} \title{Set variables to limit to data values only rather than fully completing all diff --git a/tests/testthat/test-set_limit_data_by.R b/tests/testthat/test-set_limit_data_by.R new file mode 100644 index 00000000..792e4389 --- /dev/null +++ b/tests/testthat/test-set_limit_data_by.R @@ -0,0 +1,64 @@ +library(dplyr) + +adpe <- tibble::tribble(~USUBJID, ~AVISIT, ~PECAT, ~PARAM, ~TRT01A, ~AVALC, ~AVAL, + "101-001", "Screening", "A", "Head", "TRT A", "Normal", 1, + "101-001", "Screening", "A", "Lungs", "TRT A", "Normal", 2, + "101-001", "Day -1", "A", "Lungs", "TRT A", "Normal", 3, + "101-001", "Day 5", "A", "Lungs", "TRT A", "Normal", 4, + "101-002", "Screening", "A", "Head", "TRT B", "Semi-Normal", 5, + "101-002", "Screening", "A", "Lungs", "TRT B", "Normal", 6, + "101-002", "Day -1", "A", "Lungs", "TRT B", "Normal", 7, + "101-002", "Day 5", "A", "Lungs", "TRT B", "Normal", 8, + "101-003", "Screening", "A", "Head", "TRT A", "Normal", 9, + "101-003", "Screening", "A", "Lungs", "TRT A", "Abnormal", 10, + "101-003", "Day -1", "A", "Lungs", "TRT A", "Normal", 11, + "101-003", "Day 5", "A", "Lungs", "TRT A", "Abnormal", 12, + + "101-001", "Screening", "B", "Lungs", "TRT A", "Normal", 13, + "101-001", "Day -1", "B", "Lungs", "TRT A", "Normal", 14, + "101-001", "Day 5", "B", "Lungs", "TRT A", "Normal", 15, + "101-002", "Screening", "B", "Lungs", "TRT B", "Normal", 16, + "101-002", "Day -1", "B", "Lungs", "TRT B", "Normal", 17, + "101-002", "Day 5", "B", "Lungs", "TRT B", "Normal", 18, + "101-003", "Screening", "B", "Lungs", "TRT A", "Abnormal", 19, + "101-003", "Day -1", "B", "Lungs", "TRT A", "Normal", 20, + "101-003", "Day 5", "B", "Lungs", "TRT A", "Abnormal", 21 +) + +adpe$AVALC <- factor(adpe$AVALC, levels = c("Normal", "Semi-Normal", "Abnormal")) + +test_that("Descriptive statistics data limiting works properly", { + t1 <- tplyr_table(adpe, TRT01A) %>% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) + ) + + x1 <- build(t1) + + cnts1 <- count(x1, row_label1, row_label2) + expect_equal(cnts1$n, c(18, 18, 18, 18)) + + t2 <- tplyr_table(adpe, TRT01A) %>% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) + + x2 <- build(t2) + + cnts2 <- count(x2, row_label1, row_label2) + expect_equal(cnts2$n, c(6, 18, 6, 18)) + + + t3 <- tplyr_table(adpe, TRT01A) %>% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) + + x3 <- build(t3) + + cnts3 <- count(x3, row_label1, row_label2) + expect_equal(cnts3$n, c(6, 18, 18)) +}) + From 5ae408c0ea8d61d6591aea863a7c0b2146e3c025 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Fri, 2 Feb 2024 21:59:45 +0000 Subject: [PATCH 03/24] Finish count layer basic testing --- R/count.R | 3 +-- tests/testthat/test-set_limit_data_by.R | 36 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/R/count.R b/R/count.R index 8a876513..4118dab8 100644 --- a/R/count.R +++ b/R/count.R @@ -257,8 +257,7 @@ process_count_n <- function(x) { } complete_levels <- summary_stat %>% - complete_and_limit(treat_var, by, target_var, cols, - limit = exists("limit_data_by"), + complete_and_limit(treat_var, by, cols, target_var, limit_data_by, .fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0)) summary_stat <- complete_levels %>% diff --git a/tests/testthat/test-set_limit_data_by.R b/tests/testthat/test-set_limit_data_by.R index 792e4389..bc79c52d 100644 --- a/tests/testthat/test-set_limit_data_by.R +++ b/tests/testthat/test-set_limit_data_by.R @@ -62,3 +62,39 @@ test_that("Descriptive statistics data limiting works properly", { expect_equal(cnts3$n, c(6, 18, 18)) }) +test_that("Descriptive statistics data limiting works properly", { + + t1 <- tplyr_table(adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) + ) + + x1 <- build(t1) + + cnts1 <- count(x1, row_label1, row_label2) + expect_equal(cnts1$n, c(9, 9, 9, 9)) + + t2 <- tplyr_table(adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) + + x2 <- build(t2) + + cnts2 <- count(x2, row_label1, row_label2) + expect_equal(cnts2$n, c(3, 9, 3, 9)) + + t3 <- tplyr_table(adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) + + x3 <- build(t3) + + cnts3 <- count(x3, row_label1, row_label2) + expect_equal(cnts3$n, c(3, 9, 9)) +}) + + From 52e9acf491dfc1b15888a3efed2493fc00b7a560 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Mon, 5 Feb 2024 19:36:03 +0000 Subject: [PATCH 04/24] Push development fixes for #173 --- R/sort.R | 110 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/R/sort.R b/R/sort.R index 1409f713..c1334c10 100644 --- a/R/sort.R +++ b/R/sort.R @@ -194,20 +194,28 @@ add_order_columns.count_layer <- function(x) { expr(!!sym(as_name(x)) == !!as_name(y)) }) + # Get the number of unique outer values, that is the number of rows to pull out. # If its text, it is just 1 to pull out - outer_number <- ifelse(quo_is_symbol(by[[1]]), - # Use built_target here to take the 'where' logic into account - length(unlist(unique(built_target[, as_name(by[[1]])]))), - 1) + # outer_number <- ifelse(quo_is_symbol(by[[1]]), + # # Use built_target here to take the 'where' logic into account + # nrow(filter(numeric_data, is.na(!!by[[1]]))), + # 1) + + # Identify the outer layer and attach it to the filter logic + filter_logic <- append(filter_logic, ifelse( + quo_is_symbol(by[[1]]), # Is the outside variable character or a symbol? + exprs(is.na(!!by[[1]])), # For symbols, the outer var will be NA + exprs(summary_var == !!by[[1]]) # For character, it will match summary_var + )) all_outer <- numeric_data %>% - filter(!!!filter_logic) %>% - extract(1:min(nrow(.), outer_number), ) + filter(!!!filter_logic) # Add the ordering of the pieces in the layer formatted_data <- formatted_data %>% - group_by(.data[[paste0("row_label", formatted_col_index - 1)]]) %>% + # group_by(.data[[paste0("row_label", formatted_col_index - 1)]]) %>% + group_by(row_label1) %>% do(add_data_order_nested(., formatted_col_index - 1, numeric_data, indentation_length = indentation_length, ordering_cols = ordering_cols, @@ -701,13 +709,15 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { all_outer$..index <- group_data[1,] %>% get_data_order_byvarn(varn_df, by[[1]], final_col, total_row_sort_value = total_row_sort_value) - group_data[, paste0("ord_layer_", final_col)] <- all_outer %>% - filter(summary_var == outer_value) %>% - ungroup() %>% - select(..index) + group_data[ + group_data[[tail(row_label_vec, 1)]] == outer_value, + paste0("ord_layer_", final_col) + ] <- all_outer %>% + filter(summary_var == outer_value) %>% + ungroup() %>% + select(..index) } else if(order_count_method[1] == "bycount") { - all_outer$..index <- all_outer %>% get_data_order_bycount(ordering_cols, treat_var, vars(!!!head(by, -1)), cols, result_order_var, vars(!!by[[1]], !!target_var), @@ -717,13 +727,20 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { numeric_cutoff_column = numeric_cutoff_column, nested = TRUE) - group_data[, paste0("ord_layer_", final_col)] <- all_outer %>% - filter(summary_var == outer_value) %>% - ungroup() %>% - select(..index) + group_data[ + group_data[[tail(row_label_vec, 1)]] == outer_value, + paste0("ord_layer_", final_col) + ] <- all_outer %>% + filter(summary_var == outer_value) %>% + ungroup() %>% + select(..index) } - present_vars <- unlist(group_data[-1, row_label_vec[length(row_label_vec)]]) + outer_nest_rows <- group_data %>% + filter(!!sym(tail(row_label_vec, 1)) == outer_value) %>% + nrow() + + present_vars <- group_data[(outer_nest_rows + 1): nrow(group_data),][[row_label_vec[length(row_label_vec)]]] ##### Inner nest values ##### filtered_numeric_data <- numeric_data %>% @@ -732,48 +749,53 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { # Remove nesting prefix to prepare numeric data. mutate(summary_var := str_sub(summary_var, indentation_length)) - #Same idea here, remove prefix - filtered_group_data <- group_data[-1, ] %>% + filtered_group_data <- tail(group_data, -outer_nest_rows) %>% mutate(!!row_label_vec[length(row_label_vec)] := str_sub(.data[[row_label_vec[length(row_label_vec)]]], indentation_length + 1)) + # The first row is always the first thing in the order so make it Inf - group_data[1, paste0("ord_layer_", final_col + 1)] <- ifelse((is.null(outer_inf) || outer_inf), Inf, -Inf) + group_data[1:outer_nest_rows, paste0("ord_layer_", final_col + 1)] <- ifelse((is.null(outer_inf) || outer_inf), Inf, -Inf) if(tail(order_count_method, 1) == "bycount") { if (nrow(group_data) > 1) { - group_data[-1 , paste0("ord_layer_", final_col + 1)] <- get_data_order_bycount(filtered_numeric_data, - ordering_cols, - treat_var, - head(by, -1), - cols, - result_order_var, - target_var, - break_ties = break_ties, - numeric_cutoff = numeric_cutoff, - numeric_cutoff_stat = numeric_cutoff_stat, - numeric_cutoff_column = numeric_cutoff_column, - nested = TRUE) - } - } else if(tail(order_count_method, 1) == "byvarn") { + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1)] <- get_data_order_bycount(filtered_numeric_data, + ordering_cols, + treat_var, + head(by, -1), + cols, + result_order_var, + target_var, + break_ties = break_ties, + numeric_cutoff = numeric_cutoff, + numeric_cutoff_stat = numeric_cutoff_stat, + numeric_cutoff_column = numeric_cutoff_column, + nested = TRUE) + } } else if(tail(order_count_method, 1) == "byvarn") { varn_df <- get_varn_values(target, target_var[[1]]) - group_data[-1, paste0("ord_layer_", final_col + 1)] <- get_data_order_byvarn(filtered_group_data, - varn_df, - target_var[[1]], - length(by) + 1, - indentation, - total_row_sort_value = total_row_sort_value) - + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1) + ] <- get_data_order_byvarn(filtered_group_data, + varn_df, + target_var[[1]], + length(by) + 1, + indentation, + total_row_sort_value = total_row_sort_value) } else { - - group_row_count <- nrow(group_data[-1,]) + group_row_count <- nrow(group_data[(outer_nest_rows + 1): nrow(group_data),]) # Logic for group_row_count is when numeric_where values cause unexpected results - group_data[-1, paste0("ord_layer_", final_col + 1)] <- 1:ifelse(group_row_count == 0, 1, group_row_count) + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1) + ] <- 1:ifelse(group_row_count == 0, 1, group_row_count) } From a7a5365d7c8ff596f399ad4ea6922f140043ce92 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Mon, 5 Feb 2024 21:56:12 +0000 Subject: [PATCH 05/24] I think I fixed this nightmare. --- R/sort.R | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/R/sort.R b/R/sort.R index c1334c10..c940d04c 100644 --- a/R/sort.R +++ b/R/sort.R @@ -702,6 +702,8 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { ##### Outer nest values ##### # The value of the outer label outer_value <- group_data[1, tail(row_label_vec, 1)][[1]] + # Reserve for joins + mrg_by <- paste0("row_label", seq_along(by))[-1] if(order_count_method[1] == "byvarn") { varn_df <- get_varn_values(target, as_name(by[[1]])) @@ -709,13 +711,7 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { all_outer$..index <- group_data[1,] %>% get_data_order_byvarn(varn_df, by[[1]], final_col, total_row_sort_value = total_row_sort_value) - group_data[ - group_data[[tail(row_label_vec, 1)]] == outer_value, - paste0("ord_layer_", final_col) - ] <- all_outer %>% - filter(summary_var == outer_value) %>% - ungroup() %>% - select(..index) + } else if(order_count_method[1] == "bycount") { all_outer$..index <- all_outer %>% @@ -726,14 +722,27 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { numeric_cutoff_stat = numeric_cutoff_stat, numeric_cutoff_column = numeric_cutoff_column, nested = TRUE) + } - group_data[ - group_data[[tail(row_label_vec, 1)]] == outer_value, - paste0("ord_layer_", final_col) - ] <- all_outer %>% + + # Grab the index created above and insert it into group data + if (order_count_method[1] %in% c("bycount", "byvarn")){ + if (length(mrg_by) == 0) { + group_data[,paste0("ord_layer_", final_col)] <- all_outer %>% filter(summary_var == outer_value) %>% ungroup() %>% - select(..index) + pull(..index) + } else { + group_data[,paste0("ord_layer_", final_col)] <- group_data %>% + left_join( + all_outer %>% + filter(summary_var == outer_value) %>% + replace_by_string_names(c(by, quo(summary_var))) %>% + select(starts_with('row'), ..index, -c(row_label1, !!treat_var)), + by = mrg_by + ) %>% + pull(..index) + } } outer_nest_rows <- group_data %>% From 753124a543a4e5079848dde9b07974fa2ea536a9 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Tue, 6 Feb 2024 13:49:30 +0000 Subject: [PATCH 06/24] Last update and test added. --- R/sort.R | 4 ++-- tests/testthat/test-sort.R | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/R/sort.R b/R/sort.R index c940d04c..4084716b 100644 --- a/R/sort.R +++ b/R/sort.R @@ -728,12 +728,12 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { # Grab the index created above and insert it into group data if (order_count_method[1] %in% c("bycount", "byvarn")){ if (length(mrg_by) == 0) { - group_data[,paste0("ord_layer_", final_col)] <- all_outer %>% + group_data[,"ord_layer_1"] <- all_outer %>% filter(summary_var == outer_value) %>% ungroup() %>% pull(..index) } else { - group_data[,paste0("ord_layer_", final_col)] <- group_data %>% + group_data[,"ord_layer_1"] <- group_data %>% left_join( all_outer %>% filter(summary_var == outer_value) %>% diff --git a/tests/testthat/test-sort.R b/tests/testthat/test-sort.R index c77458dd..e0236cc0 100644 --- a/tests/testthat/test-sort.R +++ b/tests/testthat/test-sort.R @@ -264,3 +264,39 @@ test_that("by variables get sorted with varn/factors in the correct order", { expect_equal(t3[["row_label1"]], c("1", "1", "1", "0", "0", "0")) expect_equal(t3[["ord_layer_1"]], c(1, 1, 1, 2, 2, 2)) }) + + +# Added to address #175 +test_that("Nested counts with by variables process properly", { + + t_ae1 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) + %>% + set_order_count_method("bycount") %>% + set_ordering_cols("Xanomeline High Dose") %>% + set_result_order_var(distinct_n) + ) + + t_ae_df1 <- t_ae1 %>% + build() + + # This is verifying that the right number of combinations of row_labels exist, and that + # there aren't duplicate order values for the outer layer + expect_equal(nrow(count(t_ae_df1, row_label2, row_label3, ord_layer_3)), 6) + + t_ae2 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars("testing", AEDECOD), by=AEOUT) %>% + set_order_count_method("bycount") %>% + set_ordering_cols("Xanomeline High Dose") %>% + set_result_order_var(distinct_n) + ) + + t_ae_df2 <- t_ae2 %>% + build() + + # Same test but now working with a text outer layer and one by variable + expect_equal(nrow(count(t_ae_df2, row_label2, ord_layer_2)), 2) + +}) From cc615e7da19eb72835eac4d7ed1bed4bbb0f02ef Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Tue, 6 Feb 2024 18:42:07 +0000 Subject: [PATCH 07/24] Handle nested counts properly --- R/count.R | 7 +++++- R/nested.R | 2 ++ R/set_limit_data_by.R | 24 ++++++++++++-------- tests/testthat/test-set_limit_data_by.R | 29 +++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/R/count.R b/R/count.R index 4118dab8..9d5cf2b0 100644 --- a/R/count.R +++ b/R/count.R @@ -101,6 +101,7 @@ process_summaries.count_layer <- function(x, ...) { process_count_denoms(x) + outer <- FALSE process_single_count_target(x) } @@ -256,9 +257,13 @@ process_count_n <- function(x) { names(missing_count_list))) } + # Need to mark this for nested counts + if (!exists('outer_')) outer_ <- FALSE + complete_levels <- summary_stat %>% complete_and_limit(treat_var, by, cols, target_var, limit_data_by, - .fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0)) + .fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0), + outer=outer_) summary_stat <- complete_levels %>% # Change the treat_var and first target_var to characters to resolve any diff --git a/R/nested.R b/R/nested.R index e05e854c..481b82b6 100644 --- a/R/nested.R +++ b/R/nested.R @@ -32,9 +32,11 @@ process_nested_count_target <- function(x) { second_denoms_by <- denoms_by } + outer_ <- TRUE first_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[1]], by = vars(!!!by), where = !!where)) + outer_ <- FALSE second_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[2]], by = vars(!!target_var[[1]], !!!by), where = !!where) %>% set_count_row_prefix(indentation) %>% diff --git a/R/set_limit_data_by.R b/R/set_limit_data_by.R index ac08047e..e6d973f0 100644 --- a/R/set_limit_data_by.R +++ b/R/set_limit_data_by.R @@ -48,15 +48,16 @@ set_limit_data_by.desc_layer <- function(e, ...) { #' General function used to process the steps to pad levels in data, or limit to #' combinations available in the data itself #' -#' @param dat -#' @param treat_var -#' @param by -#' @param cols -#' @param target_var -#' @param limit -#' @param .fill -#' TODO: Figure out best way to pass the data limiting into this function, because it doesn't exist unless distinctly set. -complete_and_limit <- function(dat, treat_var, by, cols, target_var=quos(), limit_data_by, .fill=list()) { +#' @param dat Input dataset +#' @param treat_var treat_var from tplyr_table +#' @param by by from tplyr_layer +#' @param cols cols from tplyr_table +#' @param target_var target_var from tplyr_layer +#' @param limit_data_by The variables to limit data by +#' @param .fill .fill parameter passed onto dplyr::complete +#' +#' @noRd +complete_and_limit <- function(dat, treat_var, by, cols, target_var=quos(), limit_data_by, .fill=list(), outer=FALSE) { complete_levels <- dat %>% # complete all combinations of factors to include combinations that don't exist. @@ -66,6 +67,11 @@ complete_and_limit <- function(dat, treat_var, by, cols, target_var=quos(), limi # Apply data limits specified by setter if (!is.null(limit_data_by)) { + # Outer layer won't have the target variable to limit by + if (outer) { + limit_data_by <- limit_data_by[map_chr(limit_data_by, as_name) %in% names(dat)] + } + # Find the combinations actually in the data groups_in_data <- dat %>% distinct(!!!limit_data_by) diff --git a/tests/testthat/test-set_limit_data_by.R b/tests/testthat/test-set_limit_data_by.R index bc79c52d..53a825a5 100644 --- a/tests/testthat/test-set_limit_data_by.R +++ b/tests/testthat/test-set_limit_data_by.R @@ -98,3 +98,32 @@ test_that("Descriptive statistics data limiting works properly", { }) +test_that("Nested count layers limit data accurately", { + + t_ae1 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% + set_limit_data_by(AEOUT, AEDECOD) + ) + + t_ae_df1 <- t_ae1 %>% + build() %>% select(-starts_with('ord')) + + t_ae2 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% + set_limit_data_by(AESEV, AEOUT, AEDECOD) + ) + + t_ae_df2 <- t_ae2 %>% + build() %>% select(-starts_with('ord')) + + dropped_rows <- anti_join( + t_ae_df1, + t_ae_df2, + by=names(t_ae_df1) + ) + + check <- c(dropped_rows$var1_Placebo, dropped_rows$`var1_Xanomeline High Dose`, dropped_rows$`var1_Xanomeline Low Dose`) + expect_true(all(check == " 0 ( 0.0%)")) +}) From c9d46fddc672cd4d6afec717fcd4f5602c4a396f Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Tue, 6 Feb 2024 20:11:38 +0000 Subject: [PATCH 08/24] Add data limiting for shift layers --- NAMESPACE | 1 + R/set_limit_data_by.R | 6 ++ R/shift.R | 5 +- tests/testthat/test-set_limit_data_by.R | 95 ++++++++++++++++++------- 4 files changed, 78 insertions(+), 29 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index fe7924b1..cd21ea81 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,6 +35,7 @@ S3method(set_format_strings,desc_layer) S3method(set_format_strings,shift_layer) S3method(set_limit_data_by,count_layer) S3method(set_limit_data_by,desc_layer) +S3method(set_limit_data_by,shift_layer) S3method(set_where,tplyr_layer) S3method(set_where,tplyr_table) S3method(str,f_str) diff --git a/R/set_limit_data_by.R b/R/set_limit_data_by.R index e6d973f0..2d5eb649 100644 --- a/R/set_limit_data_by.R +++ b/R/set_limit_data_by.R @@ -28,6 +28,12 @@ set_limit_data_by.count_layer <- function(e, ...) { e } +#' @export +#' @noRd +set_limit_data_by.shift_layer <- function(e, ...) { + set_limit_data_by.count_layer(e, ...) +} + #' @export #' @noRd set_limit_data_by.desc_layer <- function(e, ...) { diff --git a/R/shift.R b/R/shift.R index a6a38d7b..c6a42d9f 100644 --- a/R/shift.R +++ b/R/shift.R @@ -39,7 +39,6 @@ process_summaries.shift_layer <- function(x, ...) { process_shift_n <- function(x) { evalq({ - numeric_data <- built_target %>% # Group by variables including target variables and count them group_by(!!treat_var, !!!by, !!!unname(target_var), !!!cols) %>% @@ -47,7 +46,9 @@ process_shift_n <- function(x) { ungroup() %>% # complete all combinations of factors to include combinations that don't exist. # add 0 for combinations that don't exist - complete(!!treat_var, !!!by, !!!unname(target_var), !!!cols, fill = list(n = 0)) %>% + # complete(!!treat_var, !!!by, !!!unname(target_var), !!!cols, fill = list(n = 0)) %>% + complete_and_limit(treat_var, by, cols, unname(target_var), + limit_data_by, .fill = list(n = 0)) %>% # Change the treat_var and first target_var to characters to resolve any # issues if there are total rows and the original column is numeric mutate(!!treat_var := as.character(!!treat_var)) %>% diff --git a/tests/testthat/test-set_limit_data_by.R b/tests/testthat/test-set_limit_data_by.R index 53a825a5..28f0856e 100644 --- a/tests/testthat/test-set_limit_data_by.R +++ b/tests/testthat/test-set_limit_data_by.R @@ -1,31 +1,32 @@ library(dplyr) -adpe <- tibble::tribble(~USUBJID, ~AVISIT, ~PECAT, ~PARAM, ~TRT01A, ~AVALC, ~AVAL, - "101-001", "Screening", "A", "Head", "TRT A", "Normal", 1, - "101-001", "Screening", "A", "Lungs", "TRT A", "Normal", 2, - "101-001", "Day -1", "A", "Lungs", "TRT A", "Normal", 3, - "101-001", "Day 5", "A", "Lungs", "TRT A", "Normal", 4, - "101-002", "Screening", "A", "Head", "TRT B", "Semi-Normal", 5, - "101-002", "Screening", "A", "Lungs", "TRT B", "Normal", 6, - "101-002", "Day -1", "A", "Lungs", "TRT B", "Normal", 7, - "101-002", "Day 5", "A", "Lungs", "TRT B", "Normal", 8, - "101-003", "Screening", "A", "Head", "TRT A", "Normal", 9, - "101-003", "Screening", "A", "Lungs", "TRT A", "Abnormal", 10, - "101-003", "Day -1", "A", "Lungs", "TRT A", "Normal", 11, - "101-003", "Day 5", "A", "Lungs", "TRT A", "Abnormal", 12, - - "101-001", "Screening", "B", "Lungs", "TRT A", "Normal", 13, - "101-001", "Day -1", "B", "Lungs", "TRT A", "Normal", 14, - "101-001", "Day 5", "B", "Lungs", "TRT A", "Normal", 15, - "101-002", "Screening", "B", "Lungs", "TRT B", "Normal", 16, - "101-002", "Day -1", "B", "Lungs", "TRT B", "Normal", 17, - "101-002", "Day 5", "B", "Lungs", "TRT B", "Normal", 18, - "101-003", "Screening", "B", "Lungs", "TRT A", "Abnormal", 19, - "101-003", "Day -1", "B", "Lungs", "TRT A", "Normal", 20, - "101-003", "Day 5", "B", "Lungs", "TRT A", "Abnormal", 21 +adpe <- tibble::tribble(~USUBJID, ~AVISIT, ~PECAT, ~PARAM, ~TRT01A, ~AVALC, ~AVAL, ~BASEC, + "101-001", "Screening", "A", "Head", "TRT A", "Normal", 1, "Abnormal", + "101-001", "Screening", "A", "Lungs", "TRT A", "Normal", 2, "Semi-Normal", + "101-001", "Day -1", "A", "Lungs", "TRT A", "Normal", 3, "Normal", + "101-001", "Day 5", "A", "Lungs", "TRT A", "Normal", 4, "Normal", + "101-002", "Screening", "A", "Head", "TRT B", "Semi-Normal", 5, "Normal", + "101-002", "Screening", "A", "Lungs", "TRT B", "Normal", 6, "Normal", + "101-002", "Day -1", "A", "Lungs", "TRT B", "Normal", 7, "Normal", + "101-002", "Day 5", "A", "Lungs", "TRT B", "Normal", 8, "Semi-Normal", + "101-003", "Screening", "A", "Head", "TRT A", "Normal", 9, "Normal", + "101-003", "Screening", "A", "Lungs", "TRT A", "Abnormal", 10, "Abnormal", + "101-003", "Day -1", "A", "Lungs", "TRT A", "Normal", 11, "Abnormal", + "101-003", "Day 5", "A", "Lungs", "TRT A", "Abnormal", 12, "Abnormal", + + "101-001", "Screening", "B", "Lungs", "TRT A", "Normal", 13, "Normal", + "101-001", "Day -1", "B", "Lungs", "TRT A", "Normal", 14, "Normal", + "101-001", "Day 5", "B", "Lungs", "TRT A", "Normal", 15, "Semi-Normal", + "101-002", "Screening", "B", "Lungs", "TRT B", "Normal", 16, "Normal", + "101-002", "Day -1", "B", "Lungs", "TRT B", "Normal", 17, "Abnormal", + "101-002", "Day 5", "B", "Lungs", "TRT B", "Normal", 18, "Abnormal", + "101-003", "Screening", "B", "Lungs", "TRT A", "Abnormal", 19, "Normal", + "101-003", "Day -1", "B", "Lungs", "TRT A", "Normal", 20, "Normal", + "101-003", "Day 5", "B", "Lungs", "TRT A", "Abnormal", 21, "Normal" ) adpe$AVALC <- factor(adpe$AVALC, levels = c("Normal", "Semi-Normal", "Abnormal")) +adpe$BASEC <- factor(adpe$BASEC, levels = c("Normal", "Semi-Normal", "Abnormal")) test_that("Descriptive statistics data limiting works properly", { t1 <- tplyr_table(adpe, TRT01A) %>% @@ -62,11 +63,12 @@ test_that("Descriptive statistics data limiting works properly", { expect_equal(cnts3$n, c(6, 18, 18)) }) -test_that("Descriptive statistics data limiting works properly", { + +test_that("Shift layers can also handle data limiting", { t1 <- tplyr_table(adpe, TRT01A) %>% add_layer( - group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) + group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) ) x1 <- build(t1) @@ -76,7 +78,7 @@ test_that("Descriptive statistics data limiting works properly", { t2 <- tplyr_table(adpe, TRT01A) %>% add_layer( - group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) %>% set_limit_data_by(PARAM, AVISIT) ) @@ -87,7 +89,7 @@ test_that("Descriptive statistics data limiting works properly", { t3 <- tplyr_table(adpe, TRT01A) %>% add_layer( - group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) %>% set_limit_data_by(PECAT, PARAM, AVISIT) ) @@ -95,6 +97,7 @@ test_that("Descriptive statistics data limiting works properly", { cnts3 <- count(x3, row_label1, row_label2) expect_equal(cnts3$n, c(3, 9, 9)) + }) @@ -127,3 +130,41 @@ test_that("Nested count layers limit data accurately", { check <- c(dropped_rows$var1_Placebo, dropped_rows$`var1_Xanomeline High Dose`, dropped_rows$`var1_Xanomeline Low Dose`) expect_true(all(check == " 0 ( 0.0%)")) }) + + +test_that("Descriptive statistics data limiting works properly", { + + t1 <- tplyr_table(adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) + ) + + x1 <- build(t1) + + cnts1 <- count(x1, row_label1, row_label2) + expect_equal(cnts1$n, c(9, 9, 9, 9)) + + t2 <- tplyr_table(adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) + + x2 <- build(t2) + + cnts2 <- count(x2, row_label1, row_label2) + expect_equal(cnts2$n, c(3, 9, 3, 9)) + + t3 <- tplyr_table(adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) + + x3 <- build(t3) + + cnts3 <- count(x3, row_label1, row_label2) + expect_equal(cnts3$n, c(3, 9, 9)) +}) + + From f96c66bb420ae499a6687f429898f1a3cd466951 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Wed, 7 Feb 2024 15:26:35 +0000 Subject: [PATCH 09/24] Add documentation and convert adpe to a built-in dataset. --- R/data.R | 8 ++ R/set_limit_data_by.R | 46 ++++++++- data-raw/adpe.R | 31 ++++++ data/tplyr_adpe.rda | Bin 0 -> 500 bytes man/set_limit_data_by.Rd | 46 ++++++++- man/tplyr_adpe.Rd | 16 +++ tests/testthat/test-set_limit_data_by.R | 128 +++++++++--------------- tests/testthat/test-sort.R | 4 +- vignettes/table.Rmd | 52 ++++++++++ 9 files changed, 242 insertions(+), 89 deletions(-) create mode 100644 data-raw/adpe.R create mode 100644 data/tplyr_adpe.rda create mode 100644 man/tplyr_adpe.Rd diff --git a/R/data.R b/R/data.R index 25332b39..2627ef3f 100644 --- a/R/data.R +++ b/R/data.R @@ -47,6 +47,14 @@ #' "tplyr_adlb" +#' ADPE Data +#' +#' A mock-up dataset that is fit for testing data limiting +#' +#' @format A data.frame with 21 rows and 8 columns. +#' +#' +"tplyr_adpe" #' Get Data Labels #' diff --git a/R/set_limit_data_by.R b/R/set_limit_data_by.R index 2d5eb649..321c3366 100644 --- a/R/set_limit_data_by.R +++ b/R/set_limit_data_by.R @@ -1,11 +1,50 @@ -#' Set variables to limit to data values only rather than fully completing all -#' possible levels +#' Set variables to limit reported data values only to those that exist rather +#' than fully completing all possible levels +#' +#' This function allows you to select a combination of by variables or +#' potentially target variables for which you only want to display values +#' present in the data. By default, Tplyr will create a cartesian combination of +#' potential values of the data. For example, if you have 2 by variables +#' present, then each potential combination of those by variables will have a +#' row present in the final table. `set_limit_data_by()` allows you to choose +#' the by variables whose combination you wish to limit to values physically +#' present in the available data. #' #' @param e A tplyr_layer #' @param ... Subset of variables within by or target variables #' -#' @return +#' @return a tplyr_table +#' @md #' @export +#' +#' @examples +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) +#' ) %>% +#' build() +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% +#' set_limit_data_by(PARAM, AVISIT) +#' ) %>% +#' build() +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% +#' set_limit_data_by(PARAM, AVISIT) +#' ) %>% +#' build() +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% +#' set_limit_data_by(PECAT, PARAM, AVISIT) +#' ) %>% +#' build() set_limit_data_by <- function(e, ...) { UseMethod("set_limit_data_by") } @@ -61,6 +100,7 @@ set_limit_data_by.desc_layer <- function(e, ...) { #' @param target_var target_var from tplyr_layer #' @param limit_data_by The variables to limit data by #' @param .fill .fill parameter passed onto dplyr::complete +#' @param outer Whether to bypass variables if working through the outer layer #' #' @noRd complete_and_limit <- function(dat, treat_var, by, cols, target_var=quos(), limit_data_by, .fill=list(), outer=FALSE) { diff --git a/data-raw/adpe.R b/data-raw/adpe.R new file mode 100644 index 00000000..d80860eb --- /dev/null +++ b/data-raw/adpe.R @@ -0,0 +1,31 @@ +# This adpe dataset is just a mock-up that's fit for purpose to test and demonstrate data limiting +tplyr_adpe <- tibble::tribble(~USUBJID, ~AVISIT, ~PECAT, ~PARAM, ~TRT01A, ~AVALC, ~AVAL, ~BASEC, + "101-001", "Screening", "A", "Head", "TRT A", "Normal", 1, "Abnormal", + "101-001", "Screening", "A", "Lungs", "TRT A", "Normal", 2, "Semi-Normal", + "101-001", "Day -1", "A", "Lungs", "TRT A", "Normal", 3, "Normal", + "101-001", "Day 5", "A", "Lungs", "TRT A", "Normal", 4, "Normal", + "101-002", "Screening", "A", "Head", "TRT B", "Semi-Normal", 5, "Normal", + "101-002", "Screening", "A", "Lungs", "TRT B", "Normal", 6, "Normal", + "101-002", "Day -1", "A", "Lungs", "TRT B", "Normal", 7, "Normal", + "101-002", "Day 5", "A", "Lungs", "TRT B", "Normal", 8, "Semi-Normal", + "101-003", "Screening", "A", "Head", "TRT A", "Normal", 9, "Normal", + "101-003", "Screening", "A", "Lungs", "TRT A", "Abnormal", 10, "Abnormal", + "101-003", "Day -1", "A", "Lungs", "TRT A", "Normal", 11, "Abnormal", + "101-003", "Day 5", "A", "Lungs", "TRT A", "Abnormal", 12, "Abnormal", + + "101-001", "Screening", "B", "Lungs", "TRT A", "Normal", 13, "Normal", + "101-001", "Day -1", "B", "Lungs", "TRT A", "Normal", 14, "Normal", + "101-001", "Day 5", "B", "Lungs", "TRT A", "Normal", 15, "Semi-Normal", + "101-002", "Screening", "B", "Lungs", "TRT B", "Normal", 16, "Normal", + "101-002", "Day -1", "B", "Lungs", "TRT B", "Normal", 17, "Abnormal", + "101-002", "Day 5", "B", "Lungs", "TRT B", "Normal", 18, "Abnormal", + "101-003", "Screening", "B", "Lungs", "TRT A", "Abnormal", 19, "Normal", + "101-003", "Day -1", "B", "Lungs", "TRT A", "Normal", 20, "Normal", + "101-003", "Day 5", "B", "Lungs", "TRT A", "Abnormal", 21, "Normal" +) + +tplyr_adpe$AVALC <- factor(tplyr_adpe$AVALC, levels = c("Normal", "Semi-Normal", "Abnormal")) +tplyr_adpe$BASEC <- factor(tplyr_adpe$BASEC, levels = c("Normal", "Semi-Normal", "Abnormal")) +tplyr_adpe$AVISIT <- factor(tplyr_adpe$AVISIT, levels = c("Screening", "Day -1", "Day 5")) + +usethis::use_data(tplyr_adpe, overwrite = TRUE) diff --git a/data/tplyr_adpe.rda b/data/tplyr_adpe.rda new file mode 100644 index 0000000000000000000000000000000000000000..58557959add5372807df04914e4a4e2765568d51 GIT binary patch literal 500 zcmV@T4*^jL0KkKS)a8~YXAd}f5HFzPDE8#egOV=UqHX7-=IJM00=+x=`;bMiRlduqz_D{f$Dy!MLd)MGy&=~(@g+00B9N-F&Z*59AwF% zh{Vx=Mgm~~U?!S0$j}TVN~fs(MAJr14Ky?xP-&11KmZ1S$@)q(DO0IdvFq4LJ4#zt z<@?7u?uiNjVG%I_AOIi&1_CIXtQ_c*njBmT9CDdLM5Ypip-53p=)^%MngIC4juc9Pk2dXYU#eG-_9Im8#_i3ve$j8+B=*>S)^R)dBp>^nO<{CnDZ z_zWmuS?v3HA*&TsY_hjrN#ydE#38;^NS7xFq;X{S>L)ycl%iE$dLEul9(}ydhqv}L ze+&95DlnMr^o^Cqy}zc*sWMPw$7}WOC|)60e_%sx6yNo*!6+{5Q(FHC@z|lOZx+@pmLsg$W7!RR*vHG}JW! literal 0 HcmV?d00001 diff --git a/man/set_limit_data_by.Rd b/man/set_limit_data_by.Rd index 49203e21..c9416a76 100644 --- a/man/set_limit_data_by.Rd +++ b/man/set_limit_data_by.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/set_limit_data_by.R \name{set_limit_data_by} \alias{set_limit_data_by} -\title{Set variables to limit to data values only rather than fully completing all -possible levels} +\title{Set variables to limit reported data values only to those that exist rather +than fully completing all possible levels} \usage{ set_limit_data_by(e, ...) } @@ -12,7 +12,45 @@ set_limit_data_by(e, ...) \item{...}{Subset of variables within by or target variables} } +\value{ +a tplyr_table +} \description{ -Set variables to limit to data values only rather than fully completing all -possible levels +This function allows you to select a combination of by variables or +potentially target variables for which you only want to display values +present in the data. By default, Tplyr will create a cartesian combination of +potential values of the data. For example, if you have 2 by variables +present, then each potential combination of those by variables will have a +row present in the final table. \code{set_limit_data_by()} allows you to choose +the by variables whose combination you wish to limit to values physically +present in the available data. +} +\examples{ + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) + ) \%>\% + build() + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) \%>\% + set_limit_data_by(PARAM, AVISIT) + ) \%>\% + build() + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) \%>\% + set_limit_data_by(PARAM, AVISIT) + ) \%>\% + build() + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) \%>\% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) \%>\% + build() } diff --git a/man/tplyr_adpe.Rd b/man/tplyr_adpe.Rd new file mode 100644 index 00000000..ab19f95d --- /dev/null +++ b/man/tplyr_adpe.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{tplyr_adpe} +\alias{tplyr_adpe} +\title{ADPE Data} +\format{ +A data.frame with 21 rows and 8 columns. +} +\usage{ +tplyr_adpe +} +\description{ +A mock-up dataset that is fit for testing data limiting +} +\keyword{datasets} diff --git a/tests/testthat/test-set_limit_data_by.R b/tests/testthat/test-set_limit_data_by.R index 28f0856e..1f0e2bae 100644 --- a/tests/testthat/test-set_limit_data_by.R +++ b/tests/testthat/test-set_limit_data_by.R @@ -1,45 +1,15 @@ -library(dplyr) - -adpe <- tibble::tribble(~USUBJID, ~AVISIT, ~PECAT, ~PARAM, ~TRT01A, ~AVALC, ~AVAL, ~BASEC, - "101-001", "Screening", "A", "Head", "TRT A", "Normal", 1, "Abnormal", - "101-001", "Screening", "A", "Lungs", "TRT A", "Normal", 2, "Semi-Normal", - "101-001", "Day -1", "A", "Lungs", "TRT A", "Normal", 3, "Normal", - "101-001", "Day 5", "A", "Lungs", "TRT A", "Normal", 4, "Normal", - "101-002", "Screening", "A", "Head", "TRT B", "Semi-Normal", 5, "Normal", - "101-002", "Screening", "A", "Lungs", "TRT B", "Normal", 6, "Normal", - "101-002", "Day -1", "A", "Lungs", "TRT B", "Normal", 7, "Normal", - "101-002", "Day 5", "A", "Lungs", "TRT B", "Normal", 8, "Semi-Normal", - "101-003", "Screening", "A", "Head", "TRT A", "Normal", 9, "Normal", - "101-003", "Screening", "A", "Lungs", "TRT A", "Abnormal", 10, "Abnormal", - "101-003", "Day -1", "A", "Lungs", "TRT A", "Normal", 11, "Abnormal", - "101-003", "Day 5", "A", "Lungs", "TRT A", "Abnormal", 12, "Abnormal", - - "101-001", "Screening", "B", "Lungs", "TRT A", "Normal", 13, "Normal", - "101-001", "Day -1", "B", "Lungs", "TRT A", "Normal", 14, "Normal", - "101-001", "Day 5", "B", "Lungs", "TRT A", "Normal", 15, "Semi-Normal", - "101-002", "Screening", "B", "Lungs", "TRT B", "Normal", 16, "Normal", - "101-002", "Day -1", "B", "Lungs", "TRT B", "Normal", 17, "Abnormal", - "101-002", "Day 5", "B", "Lungs", "TRT B", "Normal", 18, "Abnormal", - "101-003", "Screening", "B", "Lungs", "TRT A", "Abnormal", 19, "Normal", - "101-003", "Day -1", "B", "Lungs", "TRT A", "Normal", 20, "Normal", - "101-003", "Day 5", "B", "Lungs", "TRT A", "Abnormal", 21, "Normal" -) - -adpe$AVALC <- factor(adpe$AVALC, levels = c("Normal", "Semi-Normal", "Abnormal")) -adpe$BASEC <- factor(adpe$BASEC, levels = c("Normal", "Semi-Normal", "Abnormal")) - test_that("Descriptive statistics data limiting works properly", { - t1 <- tplyr_table(adpe, TRT01A) %>% + t1 <- tplyr_table(tplyr_adpe, TRT01A) %>% add_layer( group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) ) x1 <- build(t1) - cnts1 <- count(x1, row_label1, row_label2) + cnts1 <- dplyr::count(x1, row_label1, row_label2) expect_equal(cnts1$n, c(18, 18, 18, 18)) - t2 <- tplyr_table(adpe, TRT01A) %>% + t2 <- tplyr_table(tplyr_adpe, TRT01A) %>% add_layer( group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% set_limit_data_by(PARAM, AVISIT) @@ -47,11 +17,11 @@ test_that("Descriptive statistics data limiting works properly", { x2 <- build(t2) - cnts2 <- count(x2, row_label1, row_label2) + cnts2 <- dplyr::count(x2, row_label1, row_label2) expect_equal(cnts2$n, c(6, 18, 6, 18)) - t3 <- tplyr_table(adpe, TRT01A) %>% + t3 <- tplyr_table(tplyr_adpe, TRT01A) %>% add_layer( group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% set_limit_data_by(PECAT, PARAM, AVISIT) @@ -59,24 +29,24 @@ test_that("Descriptive statistics data limiting works properly", { x3 <- build(t3) - cnts3 <- count(x3, row_label1, row_label2) + cnts3 <- dplyr::count(x3, row_label1, row_label2) expect_equal(cnts3$n, c(6, 18, 18)) }) test_that("Shift layers can also handle data limiting", { - t1 <- tplyr_table(adpe, TRT01A) %>% + t1 <- tplyr_table(tplyr_adpe, TRT01A) %>% add_layer( group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) ) x1 <- build(t1) - cnts1 <- count(x1, row_label1, row_label2) + cnts1 <- dplyr::count(x1, row_label1, row_label2) expect_equal(cnts1$n, c(9, 9, 9, 9)) - t2 <- tplyr_table(adpe, TRT01A) %>% + t2 <- tplyr_table(tplyr_adpe, TRT01A) %>% add_layer( group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) %>% set_limit_data_by(PARAM, AVISIT) @@ -84,10 +54,10 @@ test_that("Shift layers can also handle data limiting", { x2 <- build(t2) - cnts2 <- count(x2, row_label1, row_label2) + cnts2 <- dplyr::count(x2, row_label1, row_label2) expect_equal(cnts2$n, c(3, 9, 3, 9)) - t3 <- tplyr_table(adpe, TRT01A) %>% + t3 <- tplyr_table(tplyr_adpe, TRT01A) %>% add_layer( group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) %>% set_limit_data_by(PECAT, PARAM, AVISIT) @@ -95,56 +65,25 @@ test_that("Shift layers can also handle data limiting", { x3 <- build(t3) - cnts3 <- count(x3, row_label1, row_label2) + cnts3 <- dplyr::count(x3, row_label1, row_label2) expect_equal(cnts3$n, c(3, 9, 9)) }) -test_that("Nested count layers limit data accurately", { - - t_ae1 <- tplyr_table(tplyr_adae, TRTA) %>% - add_layer( - group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% - set_limit_data_by(AEOUT, AEDECOD) - ) - - t_ae_df1 <- t_ae1 %>% - build() %>% select(-starts_with('ord')) - - t_ae2 <- tplyr_table(tplyr_adae, TRTA) %>% - add_layer( - group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% - set_limit_data_by(AESEV, AEOUT, AEDECOD) - ) - - t_ae_df2 <- t_ae2 %>% - build() %>% select(-starts_with('ord')) - - dropped_rows <- anti_join( - t_ae_df1, - t_ae_df2, - by=names(t_ae_df1) - ) - - check <- c(dropped_rows$var1_Placebo, dropped_rows$`var1_Xanomeline High Dose`, dropped_rows$`var1_Xanomeline Low Dose`) - expect_true(all(check == " 0 ( 0.0%)")) -}) - - -test_that("Descriptive statistics data limiting works properly", { +test_that("Count data limiting works properly", { - t1 <- tplyr_table(adpe, TRT01A) %>% + t1 <- tplyr_table(tplyr_adpe, TRT01A) %>% add_layer( group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) ) x1 <- build(t1) - cnts1 <- count(x1, row_label1, row_label2) + cnts1 <- dplyr::count(x1, row_label1, row_label2) expect_equal(cnts1$n, c(9, 9, 9, 9)) - t2 <- tplyr_table(adpe, TRT01A) %>% + t2 <- tplyr_table(tplyr_adpe, TRT01A) %>% add_layer( group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% set_limit_data_by(PARAM, AVISIT) @@ -152,10 +91,10 @@ test_that("Descriptive statistics data limiting works properly", { x2 <- build(t2) - cnts2 <- count(x2, row_label1, row_label2) + cnts2 <- dplyr::count(x2, row_label1, row_label2) expect_equal(cnts2$n, c(3, 9, 3, 9)) - t3 <- tplyr_table(adpe, TRT01A) %>% + t3 <- tplyr_table(tplyr_adpe, TRT01A) %>% add_layer( group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% set_limit_data_by(PECAT, PARAM, AVISIT) @@ -163,8 +102,37 @@ test_that("Descriptive statistics data limiting works properly", { x3 <- build(t3) - cnts3 <- count(x3, row_label1, row_label2) + cnts3 <- dplyr::count(x3, row_label1, row_label2) expect_equal(cnts3$n, c(3, 9, 9)) }) +test_that("Nested count layers limit data accurately", { + + t_ae1 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% + set_limit_data_by(AEOUT, AEDECOD) + ) + + t_ae_df1 <- t_ae1 %>% + build() %>% select(-starts_with('ord')) + + t_ae2 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% + set_limit_data_by(AESEV, AEOUT, AEDECOD) + ) + + t_ae_df2 <- t_ae2 %>% + build() %>% select(-starts_with('ord')) + + dropped_rows <- anti_join( + t_ae_df1, + t_ae_df2, + by=names(t_ae_df1) + ) + + check <- c(dropped_rows$var1_Placebo, dropped_rows$`var1_Xanomeline High Dose`, dropped_rows$`var1_Xanomeline Low Dose`) + expect_true(all(check == " 0 ( 0.0%)")) +}) diff --git a/tests/testthat/test-sort.R b/tests/testthat/test-sort.R index e0236cc0..e32fa740 100644 --- a/tests/testthat/test-sort.R +++ b/tests/testthat/test-sort.R @@ -283,7 +283,7 @@ test_that("Nested counts with by variables process properly", { # This is verifying that the right number of combinations of row_labels exist, and that # there aren't duplicate order values for the outer layer - expect_equal(nrow(count(t_ae_df1, row_label2, row_label3, ord_layer_3)), 6) + expect_equal(nrow(dplyr::count(t_ae_df1, row_label2, row_label3, ord_layer_3)), 6) t_ae2 <- tplyr_table(tplyr_adae, TRTA) %>% add_layer( @@ -297,6 +297,6 @@ test_that("Nested counts with by variables process properly", { build() # Same test but now working with a text outer layer and one by variable - expect_equal(nrow(count(t_ae_df2, row_label2, ord_layer_2)), 2) + expect_equal(nrow(dplyr::count(t_ae_df2, row_label2, ord_layer_2)), 2) }) diff --git a/vignettes/table.Rmd b/vignettes/table.Rmd index ae9c9a82..bad21d72 100644 --- a/vignettes/table.Rmd +++ b/vignettes/table.Rmd @@ -115,6 +115,58 @@ header_n(t) %>% Note: it’s expected the `set_distinct_by()` function is used with population data. This is because it does not make sense to use population data denominators unless you have distinct counts. The entire point of population data is to use subject counts, so non-distinct counts would potentially count multiple records per subject and then the percentage doesn’t make any sense. +## Data Completion + +When creating summary tables, often we have to mock up the potential values of data, even if those values aren't present in the data we're summarizing. **Tplyr** does its best effort to do this for you. Let's consider the following dataset: + +```{r data_comp, echo=FALSE} +kable(head(tplyr_adpe)) +``` +Let's say we want to create a count summary for this dataset, and report it by PARAM and AVISIT. Note that in the data, `PARAM=="HEAD"` is only collected at screening, while `LUNGS` is collected at Screening, Day -1, and Day 5. + +```{r data_comp1} +tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PARAM, AVISIT)) + ) %>% + build() %>% + select(-starts_with('ord')) %>% + head(18) %>% + kable() + +``` + +By default, given the `by` variables of PARAM and AVISIT, all of the potential visits have dummy rows created that are 0 filled - meaning results of 0 records for all treatment groups are presented. However, that might not be what you wish to present. Perhaps `HEAD` was only intended to be collected at the Screening visit so it's unnecessary to present other visits. To address this, you can use the `set_limit_data_by()` function. + +```{r data_comp2} +tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) %>% + build() %>% + select(-starts_with('ord')) %>% + head(12) %>% + kable() +``` + +Here you can see that now records for `HEAD` only present the screening visit. For count and shift layers, you can additionally dig further in to use target variables: + +```{r data_comp3} +tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT, AVALC) + ) %>% + build() %>% + select(-starts_with('ord')) %>% + kable() +``` + +This effectively limits to the values present in the data itself. + +## Where to Go From Here + With the table level settings under control, now you're ready to learn more about what **Tplyr** has to offer in each layer. - Learn more about descriptive statistics layers in `vignette("desc")` From d741739906d8b281d28b501131c2cde66f204171 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 13:48:49 +0000 Subject: [PATCH 10/24] Push development for missing subs --- R/count.R | 77 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/R/count.R b/R/count.R index 030db51d..0311fe68 100644 --- a/R/count.R +++ b/R/count.R @@ -129,6 +129,8 @@ process_single_count_target <- function(x) { if (is.null(include_total_row)) include_total_row <- FALSE if (is.null(total_row_label)) total_row_label <- "Total" + if (is.null(include_missing_subjects_row)) include_missing_subjects_row <- FALSE + if (is.null(missing_subjects_row_label)) missing_subjects_row_label <- "Missing" # The current environment should be the layer itself process_count_n(current_env()) @@ -150,6 +152,10 @@ process_single_count_target <- function(x) { } } + if (include_missing_subjects_row) { + process_missing_subjects_row(current_env()) + } + if (is.null(count_row_prefix)) count_row_prefix <- "" # If a denoms variable is factor then it should be character for the denoms calculations @@ -186,17 +192,16 @@ process_single_count_target <- function(x) { } # rbind tables together - numeric_data <- summary_stat %>% - bind_rows(total_stat) %>% + numeric_data <- bind_rows(summary_stat, total_stat, missing_subjects_stat) %>% rename("summary_var" = !!target_var[[1]]) %>% group_by(!!!denoms_by) %>% do(get_denom_total(., denoms_by, denoms_df_prep, "n")) %>% mutate(summary_var = prefix_count_row(summary_var, count_row_prefix)) %>% ungroup() - if (!is.null(distinct_stat)) { if (include_total_row) { + distinct_stat <- distinct_stat %>% bind_rows(total_stat_denom) %>% group_by(!!!denoms_by) %>% @@ -275,6 +280,25 @@ process_count_n <- function(x) { } + +#' Get Logical vector that is used to remove the treat_var and cols +#' +#' In total row and missing subject counts, denoms_by needs to be stripped of +#' cols and treat_var variables, otherwise it will error out in the group_by +#' +#' @param denoms_by The layer denoms by +#' @param treat_var table treat var +#' @param cols tables cols vars +#' +#' @return list of quosures +#' @noRd +get_needed_denoms_by <- function(denoms_by, treat_var, cols) { + map_lgl(denoms_by, function(x, treat_var, cols) { + all(as_name(x) != as_name(treat_var), + as_name(x) != map_chr(cols, as_name)) + }, treat_var, cols) +} + #' Process the amounts for a total row #' #' @param x A Count layer @@ -290,16 +314,8 @@ no denoms_by variable was set. This may cause unexpected results. If you wish to change this behavior, use `set_denoms_by()`.", immediate. = TRUE) } - # Make sure the denoms_by is stripped - # Stripped of cols and treat_var variables, otherwise it will error out in the group_by - # I thought of replacing the group by with !!!unique(c(treat_var, cols, denoms_by)) - # but that doesn't work due to the denoms_by having an environment set - # Logical vector that is used to remove the treat_var and cols - needed_denoms_by <- map_lgl(denoms_by, function(x, treat_var, cols) { - all(as_name(x) != as_name(treat_var), - as_name(x) != map_chr(cols, as_name)) - }, treat_var, cols) + needed_denoms_by <- get_needed_denoms_by(denoms_by, treat_var, cols) #Create an expression to evaluate filter if (!count_missings) { @@ -332,6 +348,43 @@ change this behavior, use `set_denoms_by()`.", immediate. = TRUE) }, envir = x) } +#' Process the amounts for a missing subjects row +#' +#' @param x A Count layer +#' @noRd +process_missing_subjects_row <- function(x) { + evalq({ + + # Logical vector that is used to remove the treat_var and cols + needed_denoms_by <- get_needed_denoms_by(denoms_by, treat_var, cols) + + # Create the merge variables to join the header_n data + mrg_vars <- map_chr(c(pop_treat_var, cols, denoms_by[needed_denoms_by]), as_name) + names(mrg_vars)[1] <- as_name(treat_var) + + # create a data.frame to create total counts + missing_subjects_stat <- built_target %>% + # Use distinct if this is a distinct total row + # Group by all column variables + group_by(!!treat_var, !!!cols, !!!by, !!!distinct_by) %>% + distinct() %>% + ungroup() %>% + count(!!treat_var, !!!cols, !!!by, name="distinct_n") %>% + left_join( + header_n %>% rename(distinct_total = n), by = mrg_vars + ) %>% + # Create a variable to label the totals when it is merged in. + mutate(!!as_name(target_var[[1]]) := missing_subjects_row_label) %>% + # Create variables to carry forward 'by'. Only pull out the ones that + # aren't symbols + group_by(!!!extract_character_from_quo(by)) %>% + # ungroup right away to make sure the complete works + ungroup() %>% + # complete based on missing groupings + complete(!!treat_var, !!!cols, fill = list(n = 0, total = 0)) + }, envir = x) +} + #' Prepare metadata for table #' #' @param x count_layer object From 208c13c3ad18357a072ceeae19d38b9e495ce1a6 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 13:49:16 +0000 Subject: [PATCH 11/24] Unreachable code - this is legacy from denom refactor --- R/count.R | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/R/count.R b/R/count.R index 0311fe68..75f9115a 100644 --- a/R/count.R +++ b/R/count.R @@ -199,18 +199,6 @@ process_single_count_target <- function(x) { mutate(summary_var = prefix_count_row(summary_var, count_row_prefix)) %>% ungroup() - if (!is.null(distinct_stat)) { - if (include_total_row) { - - distinct_stat <- distinct_stat %>% - bind_rows(total_stat_denom) %>% - group_by(!!!denoms_by) %>% - do(get_denom_total(., denoms_by, denoms_df, "distinct_n")) - } - numeric_data <- bind_cols(numeric_data, - distinct_stat[, c("distinct_n", "distinct_total")]) - } - rm(denoms_df_prep, fct_cols) }, envir = x) From 111d90d87d071ac66760c7287876e3d105e81a32 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 13:49:48 +0000 Subject: [PATCH 12/24] Bindings for add missing subs --- R/count_bindings.R | 64 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/R/count_bindings.R b/R/count_bindings.R index 197c6757..a618fd9f 100644 --- a/R/count_bindings.R +++ b/R/count_bindings.R @@ -711,3 +711,67 @@ set_numeric_threshold <- function(e, numeric_cutoff, stat, column = NULL) { e } + +#' Add a missing subject row into a count summary. +#' +#' +#' @param e A \code{count_layer} object +#' @param fmt An f_str object used to format the total row. If none is provided, +#' display is based on the layer formatting. +#' @param sort_value The value that will appear in the ordering column for total +#' rows. This must be a numeric value. +#' +#' @export +#' @examples +#' +#' tplyr_table(mtcars, gear) %>% +#' add_layer( +#' group_count(cyl) %>% +#' add_missing_subjects_row(f_str("xxxx", n)) +#' ) %>% +#' build() +add_missing_subjects_row <- function(e, fmt = NULL, sort_value = NULL) { + if(!is.null(fmt)) assert_inherits_class(fmt, "f_str") + if(!is.null(sort_value)) assert_inherits_class(sort_value, "numeric") + if("shift_layer" %in% class(e)) { + rlang::abort("`add_missing_subjects_row` for shift layers is not yet supported") + } + assert_inherits_class(e, "count_layer") + + env_bind(e, include_missing_subjects_row = TRUE) + env_bind(e, missing_subjects_count_format = fmt) + env_bind(e, missing_subjects_sort_value = sort_value) + + e +} + +#' Set the label for the missing subjects row +#' +#' @param e A \code{count_layer} object +#' @param total_row_label A character to label the total row +#' +#' @return The modified \code{count_layer} object +#' @export +#' +#' @examples +#' +#' t <- tplyr_table(mtcars, gear) %>% +#' add_layer( +#' group_count(cyl) %>% +#' add_missing_subjects_row() %>% +#' set_missing_subjects_label("Missing") +#' ) +#' build(t) +set_missing_subjects_row_label <- function(e, missing_subjects_row_label) { + + assert_has_class(missing_subjects_row_label, "character") + assert_that(length(missing_subjects_row_label) == 1) + if("shift_layer" %in% class(e)) { + rlang::abort("`missing_subjects_row_label` for shift layers is not yet supported") + } + assert_inherits_class(e, "count_layer") + + env_bind(e, missing_subjects_row_label = missing_subjects_row_label) + + e +} From 0f5b1467e11696bb686164ca1b76959709aa5dd2 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 14:54:02 +0000 Subject: [PATCH 13/24] push development --- NAMESPACE | 2 ++ R/count.R | 40 +++++++++++++++++++++------ R/sort.R | 27 +++++++++++++----- R/zzz.R | 6 ++-- man/add_missing_subjects_row.Rd | 29 +++++++++++++++++++ man/set_missing_subjects_row_label.Rd | 29 +++++++++++++++++++ 6 files changed, 115 insertions(+), 18 deletions(-) create mode 100644 man/add_missing_subjects_row.Rd create mode 100644 man/set_missing_subjects_row_label.Rd diff --git a/NAMESPACE b/NAMESPACE index d2b779eb..8566d406 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,6 +45,7 @@ export(add_column_headers) export(add_filters) export(add_layer) export(add_layers) +export(add_missing_subjects_row) export(add_risk_diff) export(add_total_group) export(add_total_row) @@ -102,6 +103,7 @@ export(set_format_strings) export(set_header_n) export(set_indentation) export(set_missing_count) +export(set_missing_subjects_row_label) export(set_nest_count) export(set_numeric_threshold) export(set_order_count_method) diff --git a/R/count.R b/R/count.R index 75f9115a..6d862b2a 100644 --- a/R/count.R +++ b/R/count.R @@ -459,6 +459,7 @@ process_formatting.count_layer <- function(x, ...) { summary_var = summary_var, indentation_length = indentation_length, total_count_format = total_count_format, + missing_subjects_count_format = missing_subjects_count_format, total_row_label = total_row_label, has_missing_count = has_missing_count) }) %>% @@ -526,7 +527,7 @@ process_formatting.count_layer <- function(x, ...) { construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_total = NULL, count_fmt = NULL, max_layer_length, max_n_width, missing_string, missing_f_str, summary_var, indentation_length, total_count_format, - total_row_label, has_missing_count) { + missing_subjects_count_format, total_row_label, has_missing_count) { ## Added this for processing formatting in nested count layers where this won't be processed yet if (is.null(max_layer_length)) max_layer_length <- 0 @@ -556,6 +557,12 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot total_vars_ord <- map_chr(total_count_format$vars, as_name) } + ## Pull out string information for missing subject rows + if (!is.null(missing_subjects_count_format)) { + missing_subject_rows <- summary_var %in% missing_subjects_row_label + missing_subject_vars_ord <- map_chr(missing_subjects_count_format$vars, as_name) + } + vars_ord <- map_chr(count_fmt$vars, as_name) # str_all is a list that contains character vectors for each parameter that might be calculated @@ -563,9 +570,14 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot # Append the repl_str to be passed to do.call str_all[1] <- count_fmt$repl_str # Iterate over every variable + rows_ <- !missing_rows & !total_rows & !missing_subject_rows for (i in seq_along(vars_ord)) { - str_all[[i + 1]] <- count_string_switch_help(vars_ord[i], count_fmt, .n[!missing_rows & !total_rows], .total[!missing_rows & !total_rows], - .distinct_n[!missing_rows & !total_rows], .distinct_total[!missing_rows & !total_rows], vars_ord) + str_all[[i + 1]] <- count_string_switch_help(vars_ord[i], count_fmt, + .n[rows_], + .total[rows_], + .distinct_n[rows_], + .distinct_total[rows_], + vars_ord) } @@ -595,20 +607,32 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot total_vars_ord) } + missing_subs_str_all <- vector("list", 5) + missing_subs_str_all[1] <- missing_subjects_count_format$repl_str + for (i in seq_along(missing_subject_vars_ord)) { + missing_subs_str_all[[i + 1]] <- count_string_switch_help(missing_subject_vars_ord[i], + missing_subjects_count_format, + .n[missing_subject_rows], + .total[missing_subject_rows], + .distinct_n[missing_subject_rows], + .distinct_total[missing_subject_rows], + missing_subject_vars_ord) + } + # Put the vector strings together. Only include parts of str_all that aren't null - # nm is non-missing, m is mising, and t is total. + # nm is non-missing, m is missing, t is total, ms is missing subjects string_nm <- do.call(sprintf, str_all[!map_lgl(str_all, is.null)]) if (!is.null(missing_vars_ord)) string_m <- do.call(sprintf, missing_str_all[!map_lgl(missing_str_all, is.null)]) if (!is.null(total_vars_ord)) string_t <- do.call(sprintf, total_str_all[!map_lgl(total_str_all, is.null)]) + if (!is.null(missing_subject_vars_ord)) string_ms <- do.call(sprintf, missing_subs_str_all[!map_lgl(missing_subs_str_all, is.null)]) # string_ is the final string to return. Merge the missing, non-missing, and others together string_ <- character(length(string_nm) + length(string_m) + length(string_t)) - string_[!missing_rows & !total_rows] <- string_nm + string_[rows_] <- string_nm string_[total_rows] <- string_t string_[missing_rows] <- string_m + string_[missing_subject_rows] <- string_ms - - - + browser() # Left pad set to 0 meaning it won't pad to the left at all # right pad is set to the maximum n count in the table string_ <- pad_formatted_data(string_, 0, max_n_width) diff --git a/R/sort.R b/R/sort.R index 1409f713..2d84b0be 100644 --- a/R/sort.R +++ b/R/sort.R @@ -352,7 +352,8 @@ add_order_columns.shift_layer <- function(x) { # The logic is the same now for a byvarn so reuse that function formatted_data[, paste0("ord_layer_", formatted_col_index)] <- get_data_order_byvarn(formatted_data, fact_df, as_name(target_var$row), - formatted_col_index, total_row_sort_value = total_row_sort_value) + formatted_col_index, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) rm(formatted_col_index) @@ -432,7 +433,7 @@ get_data_order <- function(x, formatted_col_index) { get_data_order_bycount(numeric_data, ordering_cols, treat_var, by, cols, result_order_var, target_var, missing_index, missing_sort_value, - total_index, total_row_sort_value, + total_index, total_row_sort_value, missing_subjects_sort_value, break_ties = break_ties, numeric_cutoff = numeric_cutoff, numeric_cutoff_stat = numeric_cutoff_stat, @@ -466,7 +467,8 @@ get_data_order <- function(x, formatted_col_index) { } get_data_order_byvarn(formatted_data, varn_df, as_name(target_var[[1]]), - formatted_col_index, total_row_sort_value = total_row_sort_value) + formatted_col_index, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) # Here it is 'byfactor' @@ -521,7 +523,8 @@ get_data_order <- function(x, formatted_col_index) { # The logic is the same now for a byvarn so reuse that function get_data_order_byvarn(formatted_data, fact_df, as_name(target_var[[1]]), - formatted_col_index, total_row_sort_value = total_row_sort_value) + formatted_col_index, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) } }, envir = x) } @@ -532,6 +535,7 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, treat_var, by, cols, result_order_var, target_var, missing_index = NULL, missing_sort_value = NULL, total_index = NULL, total_row_sort_value = NULL, + missing_subjects_sort_value = NULL, break_ties, numeric_cutoff, numeric_cutoff_stat, numeric_cutoff_column, nested = FALSE) { @@ -606,6 +610,10 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, numeric_ordering_data[total_index,] <- total_row_sort_value } + if(!is.null(missing_subjects_index) && !is.null(missing_subjects_sort_value)) { + numeric_ordering_data[missing_subjects_sort_value,] <- missing_subjects_sort_value + } + # This is the numeric index that the numeric data is in. radix was chosen because # its the only method that gives indicies as far as I can tell # x are the values @@ -632,7 +640,8 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, } get_data_order_byvarn <- function(formatted_data, by_varn_df, by_var, by_column_index, - indentation = "", total_row_sort_value = NULL) { + indentation = "", total_row_sort_value = NULL, + missing_subjects_sort_value = NULL) { # Pull out the by values in the formatted data. by_values <- unlist(formatted_data[, by_column_index]) @@ -650,6 +659,8 @@ get_data_order_byvarn <- function(formatted_data, by_varn_df, by_var, by_column_ # Flag to determine where total row is positioned if(!is.null(total_row_sort_value)) { total_row_sort_value + } else if (!is.null(missing_subjects_sort_value)){ + missing_subjects_sort_value } else { max(by_varn_df[,2]) + 1 } @@ -699,7 +710,8 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { varn_df <- get_varn_values(target, as_name(by[[1]])) all_outer$..index <- group_data[1,] %>% - get_data_order_byvarn(varn_df, by[[1]], final_col, total_row_sort_value = total_row_sort_value) + get_data_order_byvarn(varn_df, by[[1]], final_col, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) group_data[, paste0("ord_layer_", final_col)] <- all_outer %>% filter(summary_var == outer_value) %>% @@ -767,7 +779,8 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { target_var[[1]], length(by) + 1, indentation, - total_row_sort_value = total_row_sort_value) + total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) } else { diff --git a/R/zzz.R b/R/zzz.R index 70df2747..b06b2444 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -199,7 +199,6 @@ display_string <- NULL built_target <- NULL table_where <- NULL distinct_by <- NULL -distinct_stat <- NULL summary_vars <- NULL trans_vars <- NULL stat <- NULL @@ -277,7 +276,6 @@ missing_sort_value <- NULL missing_index <- NULL total_index <- NULL process_distinct_total <- FALSE -total_stat_denom <- NULL denom_where <- NULL built_target_pre_where <- NULL count_fmt <- NULL @@ -307,4 +305,6 @@ og_row <- NULL desc <- NULL id <- NULL stub_sort <- NULL - +include_missing_subjects_row <- NULL +missing_subjects_row_label <- NULL +missing_subjects_stat <- NULL diff --git a/man/add_missing_subjects_row.Rd b/man/add_missing_subjects_row.Rd new file mode 100644 index 00000000..cfb873d7 --- /dev/null +++ b/man/add_missing_subjects_row.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/count_bindings.R +\name{add_missing_subjects_row} +\alias{add_missing_subjects_row} +\title{Add a missing subject row into a count summary.} +\usage{ +add_missing_subjects_row(e, fmt = NULL, sort_value = NULL) +} +\arguments{ +\item{e}{A \code{count_layer} object} + +\item{fmt}{An f_str object used to format the total row. If none is provided, +display is based on the layer formatting.} + +\item{sort_value}{The value that will appear in the ordering column for total +rows. This must be a numeric value.} +} +\description{ +Add a missing subject row into a count summary. +} +\examples{ + +tplyr_table(mtcars, gear) \%>\% + add_layer( + group_count(cyl) \%>\% + add_missing_subjects_row(f_str("xxxx", n)) + ) \%>\% + build() +} diff --git a/man/set_missing_subjects_row_label.Rd b/man/set_missing_subjects_row_label.Rd new file mode 100644 index 00000000..ed75af9e --- /dev/null +++ b/man/set_missing_subjects_row_label.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/count_bindings.R +\name{set_missing_subjects_row_label} +\alias{set_missing_subjects_row_label} +\title{Set the label for the missing subjects row} +\usage{ +set_missing_subjects_row_label(e, missing_subjects_row_label) +} +\arguments{ +\item{e}{A \code{count_layer} object} + +\item{total_row_label}{A character to label the total row} +} +\value{ +The modified \code{count_layer} object +} +\description{ +Set the label for the missing subjects row +} +\examples{ + +t <- tplyr_table(mtcars, gear) \%>\% + add_layer( + group_count(cyl) \%>\% + add_missing_subjects_row() \%>\% + set_missing_subjects_label("Missing") + ) +build(t) +} From 0bb5023a4ef6ee58d49cab676997b56f0b1a31f9 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 15:12:32 +0000 Subject: [PATCH 14/24] save so I can compare --- R/count.R | 2 ++ R/zzz.R | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/R/count.R b/R/count.R index 6d862b2a..93898cb4 100644 --- a/R/count.R +++ b/R/count.R @@ -534,6 +534,7 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot if (is.null(max_n_width)) max_n_width <- 0 missing_rows <- FALSE total_rows <- FALSE + missing_subject_rows <- FALSE # Add in the missing format if its null and there are missing counts if (has_missing_count && is.null(missing_f_str)) { @@ -621,6 +622,7 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot # Put the vector strings together. Only include parts of str_all that aren't null # nm is non-missing, m is missing, t is total, ms is missing subjects + browser() string_nm <- do.call(sprintf, str_all[!map_lgl(str_all, is.null)]) if (!is.null(missing_vars_ord)) string_m <- do.call(sprintf, missing_str_all[!map_lgl(missing_str_all, is.null)]) if (!is.null(total_vars_ord)) string_t <- do.call(sprintf, total_str_all[!map_lgl(total_str_all, is.null)]) diff --git a/R/zzz.R b/R/zzz.R index b06b2444..6fb9f413 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -308,3 +308,7 @@ stub_sort <- NULL include_missing_subjects_row <- NULL missing_subjects_row_label <- NULL missing_subjects_stat <- NULL +missing_subjects_count_format <- NULL +missing_subject_rows <- NULL +missing_subject_vars_ord <- NULL +string_ms <- NULL From c9904b73a5b1ac181cef45a4b32d502f921cbe82 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 21:26:18 +0000 Subject: [PATCH 15/24] namespace --- NAMESPACE | 1 + R/zzz.R | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 8566d406..233873fc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -141,6 +141,7 @@ importFrom(dplyr,between) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) +importFrom(dplyr,count) importFrom(dplyr,cur_column) importFrom(dplyr,cur_group) importFrom(dplyr,desc) diff --git a/R/zzz.R b/R/zzz.R index 6fb9f413..8fbb1823 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -11,7 +11,7 @@ #' @importFrom stats IQR median sd quantile var #' @importFrom dplyr n summarize filter vars tally ungroup group_by mutate lag select bind_rows full_join add_tally distinct rowwise #' @importFrom dplyr everything rename mutate_at mutate_all as_tibble bind_cols do case_when arrange left_join row_number between mutate_if -#' @importFrom dplyr across anti_join n_distinct if_else group_keys cur_group cur_column pull matches slice_head where desc +#' @importFrom dplyr across anti_join n_distinct if_else group_keys cur_group cur_column pull matches slice_head where desc count #' @importFrom tidyr complete nesting pivot_wider pivot_longer replace_na starts_with fill #' @importFrom utils str head tail #' @importFrom tidyselect all_of vars_select any_of @@ -312,3 +312,4 @@ missing_subjects_count_format <- NULL missing_subject_rows <- NULL missing_subject_vars_ord <- NULL string_ms <- NULL +missing_subjects_sort_value <- NULL From 963aeb5c11da0d91d578ff2922cc51cb24e64544 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 21:26:51 +0000 Subject: [PATCH 16/24] I don't actually see what changed here? --- R/utils.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/utils.R b/R/utils.R index 1187d100..e38ab18f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -314,8 +314,8 @@ ut_round <- function(x, n=0) { # x is the value to be rounded # n is the precision of the rounding - posneg <- sign(x) - e <- abs(x) * 10^n + posneg <- sign(x) + e <- abs(x) * 10^n e <- e + 0.5 + sqrt(.Machine$double.eps) e <- trunc(e) e <- e / 10^n From aeb9843c415093fdafa78200d2bcd181f5dd203a Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 21:27:17 +0000 Subject: [PATCH 17/24] iron out nuance and sorting for subject counts --- R/count.R | 50 +++++++++++++++++++++++++++----------------------- R/nested.R | 19 ++++++++++++++----- R/sort.R | 13 ++++++++++--- 3 files changed, 51 insertions(+), 31 deletions(-) diff --git a/R/count.R b/R/count.R index 93898cb4..5726dc68 100644 --- a/R/count.R +++ b/R/count.R @@ -39,6 +39,13 @@ process_summaries.count_layer <- function(x, ...) { sep = " "))) } + # Do this here to make sure that defaults are available everywhere else + # Downstream + if (is.null(include_total_row)) include_total_row <- FALSE + if (is.null(total_row_label)) total_row_label <- "Total" + if (is.null(include_missing_subjects_row)) include_missing_subjects_row <- FALSE + if (is.null(missing_subjects_row_label)) missing_subjects_row_label <- "Missing" + # Save this for the denominator where, but only if it hasn't been saved yet. if (is.null(built_target_pre_where)) built_target_pre_where <- built_target @@ -127,11 +134,6 @@ process_summaries.count_layer <- function(x, ...) { process_single_count_target <- function(x) { evalq({ - if (is.null(include_total_row)) include_total_row <- FALSE - if (is.null(total_row_label)) total_row_label <- "Total" - if (is.null(include_missing_subjects_row)) include_missing_subjects_row <- FALSE - if (is.null(missing_subjects_row_label)) missing_subjects_row_label <- "Missing" - # The current environment should be the layer itself process_count_n(current_env()) @@ -330,9 +332,7 @@ change this behavior, use `set_denoms_by()`.", immediate. = TRUE) # aren't symbols group_by(!!!extract_character_from_quo(by)) %>% # ungroup right away to make sure the complete works - ungroup() %>% - # complete based on missing groupings - complete(!!treat_var, !!!cols, fill = list(n = 0, total = 0)) + ungroup() }, envir = x) } @@ -349,27 +349,28 @@ process_missing_subjects_row <- function(x) { # Create the merge variables to join the header_n data mrg_vars <- map_chr(c(pop_treat_var, cols, denoms_by[needed_denoms_by]), as_name) names(mrg_vars)[1] <- as_name(treat_var) - # create a data.frame to create total counts missing_subjects_stat <- built_target %>% # Use distinct if this is a distinct total row # Group by all column variables - group_by(!!treat_var, !!!cols, !!!by, !!!distinct_by) %>% - distinct() %>% + distinct(!!treat_var, !!!cols, !!!by, !!!distinct_by) %>% ungroup() %>% - count(!!treat_var, !!!cols, !!!by, name="distinct_n") %>% + count(!!treat_var, !!!cols, !!!by, name="n_present") %>% + # complete based on missing groupings + complete(!!treat_var, !!!cols, !!!by, fill = list(distinct_n = 0)) %>% left_join( - header_n %>% rename(distinct_total = n), by = mrg_vars + header_n %>% rename(header_tots = n), by = mrg_vars ) %>% # Create a variable to label the totals when it is merged in. - mutate(!!as_name(target_var[[1]]) := missing_subjects_row_label) %>% + mutate( + !!as_name(target_var[[1]]) := missing_subjects_row_label, + distinct_n = header_tots - n_present + ) %>% # Create variables to carry forward 'by'. Only pull out the ones that # aren't symbols group_by(!!!extract_character_from_quo(by)) %>% # ungroup right away to make sure the complete works - ungroup() %>% - # complete based on missing groupings - complete(!!treat_var, !!!cols, fill = list(n = 0, total = 0)) + ungroup() }, envir = x) } @@ -461,6 +462,7 @@ process_formatting.count_layer <- function(x, ...) { total_count_format = total_count_format, missing_subjects_count_format = missing_subjects_count_format, total_row_label = total_row_label, + missing_subjects_row_label = missing_subjects_row_label, has_missing_count = has_missing_count) }) %>% # Pivot table @@ -521,13 +523,19 @@ process_formatting.count_layer <- function(x, ...) { #' target variable. #' @param indentation_length If this is a nested count layer. The row prefixes #' must be removed +#' @param total_count_format f_str for total counts +#' @param missing_subjects_count_format f_str for missing subjects +#' @param total_row_label Label string for total rows +#' @param missing_subjects_row_label Label string for missing subjects +#' @param has_missing_count Boolean for if missing counts are present #' #' @return A tibble replacing the original counts #' @noRd construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_total = NULL, count_fmt = NULL, max_layer_length, max_n_width, missing_string, missing_f_str, summary_var, indentation_length, total_count_format, - missing_subjects_count_format, total_row_label, has_missing_count) { + missing_subjects_count_format, total_row_label, missing_subjects_row_label, + has_missing_count) { ## Added this for processing formatting in nested count layers where this won't be processed yet if (is.null(max_layer_length)) max_layer_length <- 0 @@ -581,7 +589,6 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot vars_ord) } - # Logic for missing # Same logic as above, just add for missing missing_str_all <- vector("list", 5) @@ -622,19 +629,16 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot # Put the vector strings together. Only include parts of str_all that aren't null # nm is non-missing, m is missing, t is total, ms is missing subjects - browser() string_nm <- do.call(sprintf, str_all[!map_lgl(str_all, is.null)]) if (!is.null(missing_vars_ord)) string_m <- do.call(sprintf, missing_str_all[!map_lgl(missing_str_all, is.null)]) if (!is.null(total_vars_ord)) string_t <- do.call(sprintf, total_str_all[!map_lgl(total_str_all, is.null)]) if (!is.null(missing_subject_vars_ord)) string_ms <- do.call(sprintf, missing_subs_str_all[!map_lgl(missing_subs_str_all, is.null)]) # string_ is the final string to return. Merge the missing, non-missing, and others together - string_ <- character(length(string_nm) + length(string_m) + length(string_t)) + string_ <- character(sum(length(string_nm), length(string_m), length(string_t), length(string_ms))) string_[rows_] <- string_nm string_[total_rows] <- string_t string_[missing_rows] <- string_m string_[missing_subject_rows] <- string_ms - - browser() # Left pad set to 0 meaning it won't pad to the left at all # right pad is set to the maximum n count in the table string_ <- pad_formatted_data(string_, 0, max_n_width) diff --git a/R/nested.R b/R/nested.R index e05e854c..971a9770 100644 --- a/R/nested.R +++ b/R/nested.R @@ -32,8 +32,11 @@ process_nested_count_target <- function(x) { second_denoms_by <- denoms_by } - first_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[1]], - by = vars(!!!by), where = !!where)) + # Missing subject counts should not occur in the outer layer + fl <- group_count(current_env(), target_var = !!target_var[[1]], + by = vars(!!!by), where = !!where) + fl$include_missing_subjects_row <- FALSE + first_layer <- process_summaries(fl) second_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[2]], by = vars(!!target_var[[1]], !!!by), where = !!where) %>% @@ -50,7 +53,8 @@ process_nested_count_target <- function(x) { treat_var = treat_var ) %>% group_by(!!target_var[[1]]) %>% - do(filter_nested_inner_layer(., target, target_var[[1]], target_var[[2]], indentation)) + do(filter_nested_inner_layer(., target, target_var[[1]], target_var[[2]], indentation, + missing_subjects_row_label)) ignored_filter_rows <- ifelse(include_total_row, ifelse(is.null(total_row_label), @@ -85,7 +89,8 @@ process_nested_count_target <- function(x) { #' This function is meant to remove the values of an inner layer that don't #' appear in the target data #' @noRd -filter_nested_inner_layer <- function(.group, target, outer_name, inner_name, indentation) { +filter_nested_inner_layer <- function(.group, target, outer_name, inner_name, indentation, + missing_subjects_row_label) { # Is outer variable text? If it is don't filter on it text_outer <- !quo_is_symbol(outer_name) @@ -108,9 +113,13 @@ filter_nested_inner_layer <- function(.group, target, outer_name, inner_name, in filter(!!sym(outer_name) == current_outer_value) %>% select(any_of(inner_name)) %>% unlist() %>% - paste0(indentation, .) + paste0(indentation, .) %>% + unique() } + target_inner_values <- c(target_inner_values %>% unique(), + paste0(indentation, missing_subjects_row_label)) + .group %>% filter(summary_var %in% target_inner_values) diff --git a/R/sort.R b/R/sort.R index 2d84b0be..e38dcb55 100644 --- a/R/sort.R +++ b/R/sort.R @@ -222,7 +222,9 @@ add_order_columns.count_layer <- function(x) { break_ties = break_ties, numeric_cutoff = numeric_cutoff, numeric_cutoff_stat = numeric_cutoff_stat, - numeric_cutoff_column = numeric_cutoff_column)) %>% + numeric_cutoff_column = numeric_cutoff_column, + missing_subjects_row_label = missing_subjects_row_label, + missing_subjects_sort_value = missing_subjects_sort_value)) %>% ungroup() if (!is.null(nest_count) && nest_count) { @@ -428,6 +430,9 @@ get_data_order <- function(x, formatted_col_index) { if(!is.null(missing_string)) missing_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_string) if(!is.null(total_row_label)) total_index <- which(unlist(formatted_data[, label_row_ind]) %in% total_row_label) + if(!is.null(missing_subjects_row_label)) { + missing_subjects_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_subjects_row_label) + } # No processing is needed here just pass in the needed info get_data_order_bycount(numeric_data, ordering_cols, @@ -611,7 +616,7 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, } if(!is.null(missing_subjects_index) && !is.null(missing_subjects_sort_value)) { - numeric_ordering_data[missing_subjects_sort_value,] <- missing_subjects_sort_value + numeric_ordering_data[missing_subjects_index,] <- missing_subjects_sort_value } # This is the numeric index that the numeric data is in. radix was chosen because @@ -783,11 +788,13 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { missing_subjects_sort_value = missing_subjects_sort_value) } else { - group_row_count <- nrow(group_data[-1,]) # Logic for group_row_count is when numeric_where values cause unexpected results group_data[-1, paste0("ord_layer_", final_col + 1)] <- 1:ifelse(group_row_count == 0, 1, group_row_count) + # missing_subjects_row_label not passing in here + missing_rows <- group_data[[paste0('row_label', final_col+1)]] == paste0(indentation, missing_subjects_row_label) + group_data[missing_rows, paste0("ord_layer_", final_col + 1)] <- missing_subjects_sort_value } group_data From 54d5c1b7e0908598f5dba740e49635d570711100 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 21:34:06 +0000 Subject: [PATCH 18/24] gh_issue_84 merge --- R/nested.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/nested.R b/R/nested.R index 971a9770..bea05511 100644 --- a/R/nested.R +++ b/R/nested.R @@ -37,7 +37,11 @@ process_nested_count_target <- function(x) { by = vars(!!!by), where = !!where) fl$include_missing_subjects_row <- FALSE first_layer <- process_summaries(fl) + outer_ <- TRUE + first_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[1]], + by = vars(!!!by), where = !!where)) + outer_ <- FALSE second_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[2]], by = vars(!!target_var[[1]], !!!by), where = !!where) %>% set_count_row_prefix(indentation) %>% From 89de295110ad7f319c67aa0bec7da82d9a117a56 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 21:34:06 +0000 Subject: [PATCH 19/24] gh_issue_84 merge --- R/nested.R | 4 ++++ R/sort.R | 18 ++---------------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/R/nested.R b/R/nested.R index 971a9770..bea05511 100644 --- a/R/nested.R +++ b/R/nested.R @@ -37,7 +37,11 @@ process_nested_count_target <- function(x) { by = vars(!!!by), where = !!where) fl$include_missing_subjects_row <- FALSE first_layer <- process_summaries(fl) + outer_ <- TRUE + first_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[1]], + by = vars(!!!by), where = !!where)) + outer_ <- FALSE second_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[2]], by = vars(!!target_var[[1]], !!!by), where = !!where) %>% set_count_row_prefix(indentation) %>% diff --git a/R/sort.R b/R/sort.R index 22d0c8dc..e9de385b 100644 --- a/R/sort.R +++ b/R/sort.R @@ -804,20 +804,6 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { varn_df <- get_varn_values(target, target_var[[1]]) - - -<<<<<<< HEAD - group_data[-1, paste0("ord_layer_", final_col + 1)] <- get_data_order_byvarn(filtered_group_data, - varn_df, - target_var[[1]], - length(by) + 1, - indentation, - total_row_sort_value = total_row_sort_value, - missing_subjects_sort_value = missing_subjects_sort_value) - - } else { - group_row_count <- nrow(group_data[-1,]) -======= group_data[ (outer_nest_rows + 1): nrow(group_data), paste0("ord_layer_", final_col + 1) @@ -826,10 +812,10 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { target_var[[1]], length(by) + 1, indentation, - total_row_sort_value = total_row_sort_value) + total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) } else { group_row_count <- nrow(group_data[(outer_nest_rows + 1): nrow(group_data),]) ->>>>>>> gh_issue_84 # Logic for group_row_count is when numeric_where values cause unexpected results group_data[ (outer_nest_rows + 1): nrow(group_data), From a9cc8936c227aca708962edaaf3198d3b0275ba0 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Thu, 8 Feb 2024 22:28:34 +0000 Subject: [PATCH 20/24] start testing and push progress --- R/count.R | 2 +- R/count_bindings.R | 2 +- R/nested.R | 5 +- R/sort.R | 12 ++- tests/testthat/test-count.R | 146 ++++++++++++++++++++++++++++++++++++ 5 files changed, 160 insertions(+), 7 deletions(-) diff --git a/R/count.R b/R/count.R index 0cb48224..270f3d64 100644 --- a/R/count.R +++ b/R/count.R @@ -362,7 +362,7 @@ process_missing_subjects_row <- function(x) { ungroup() %>% count(!!treat_var, !!!cols, !!!by, name="n_present") %>% # complete based on missing groupings - complete(!!treat_var, !!!cols, !!!by, fill = list(distinct_n = 0)) %>% + complete(!!treat_var, !!!cols, !!!by, fill = list(n_present = 0)) %>% left_join( header_n %>% rename(header_tots = n), by = mrg_vars ) %>% diff --git a/R/count_bindings.R b/R/count_bindings.R index a618fd9f..1eb1ddcb 100644 --- a/R/count_bindings.R +++ b/R/count_bindings.R @@ -767,7 +767,7 @@ set_missing_subjects_row_label <- function(e, missing_subjects_row_label) { assert_has_class(missing_subjects_row_label, "character") assert_that(length(missing_subjects_row_label) == 1) if("shift_layer" %in% class(e)) { - rlang::abort("`missing_subjects_row_label` for shift layers is not yet supported") + rlang::abort("`set_missing_subjects_row_label` for shift layers is not yet supported") } assert_inherits_class(e, "count_layer") diff --git a/R/nested.R b/R/nested.R index bea05511..9843c188 100644 --- a/R/nested.R +++ b/R/nested.R @@ -36,10 +36,9 @@ process_nested_count_target <- function(x) { fl <- group_count(current_env(), target_var = !!target_var[[1]], by = vars(!!!by), where = !!where) fl$include_missing_subjects_row <- FALSE - first_layer <- process_summaries(fl) outer_ <- TRUE - first_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[1]], - by = vars(!!!by), where = !!where)) + first_layer <- process_summaries(fl) + outer_ <- FALSE second_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[2]], diff --git a/R/sort.R b/R/sort.R index e9de385b..00d37436 100644 --- a/R/sort.R +++ b/R/sort.R @@ -438,6 +438,7 @@ get_data_order <- function(x, formatted_col_index) { if(!is.null(missing_string)) missing_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_string) if(!is.null(total_row_label)) total_index <- which(unlist(formatted_data[, label_row_ind]) %in% total_row_label) + if(!is.null(missing_subjects_row_label)) { missing_subjects_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_subjects_row_label) } @@ -446,7 +447,8 @@ get_data_order <- function(x, formatted_col_index) { get_data_order_bycount(numeric_data, ordering_cols, treat_var, by, cols, result_order_var, target_var, missing_index, missing_sort_value, - total_index, total_row_sort_value, missing_subjects_sort_value, + total_index, total_row_sort_value, + missing_subjects_index, missing_subjects_sort_value, break_ties = break_ties, numeric_cutoff = numeric_cutoff, numeric_cutoff_stat = numeric_cutoff_stat, @@ -548,6 +550,7 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, treat_var, by, cols, result_order_var, target_var, missing_index = NULL, missing_sort_value = NULL, total_index = NULL, total_row_sort_value = NULL, + missing_subjects_index = NULL, missing_subjects_sort_value = NULL, break_ties, numeric_cutoff, numeric_cutoff_stat, numeric_cutoff_column, nested = FALSE) { @@ -780,12 +783,15 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { mutate(!!row_label_vec[length(row_label_vec)] := str_sub(.data[[row_label_vec[length(row_label_vec)]]], indentation_length + 1)) - + # Pick up here = we need to get the missing_subjects_index, but the label + # isn't so need to find the index and then pass that into get_data_order_bycount + browser() # The first row is always the first thing in the order so make it Inf group_data[1:outer_nest_rows, paste0("ord_layer_", final_col + 1)] <- ifelse((is.null(outer_inf) || outer_inf), Inf, -Inf) if(tail(order_count_method, 1) == "bycount") { if (nrow(group_data) > 1) { + browser() group_data[ (outer_nest_rows + 1): nrow(group_data), paste0("ord_layer_", final_col + 1)] <- get_data_order_bycount(filtered_numeric_data, @@ -796,6 +802,8 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { result_order_var, target_var, break_ties = break_ties, + missing_subjects_index = missing_subjects_index, + missing_subjects_sort_value = missing_subjects_sort_value, numeric_cutoff = numeric_cutoff, numeric_cutoff_stat = numeric_cutoff_stat, numeric_cutoff_column = numeric_cutoff_column, diff --git a/tests/testthat/test-count.R b/tests/testthat/test-count.R index b4d929f2..f8219061 100644 --- a/tests/testthat/test-count.R +++ b/tests/testthat/test-count.R @@ -945,3 +945,149 @@ test_that("Regression test to make sure cols produce correct denom", { expect_snapshot(t) }) + +test_that("Error checking for add_missing_subjects_row()", { + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + add_missing_subjects_row("blah") + ) + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + add_missing_subjects_row(f_str("xx", distinct_n), sort_value = "x") + ) + ) + + expect_error({ + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_shift(vars(AEBODSYS, AEDECOD)) %>% + add_missing_subjects_row(f_str("xx", distinct_n)) + ) + }, "`add_missing_subjects_row` for shift layers" + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_desc(RACEN) %>% + add_missing_subjects_row(f_str("xx", distinct_n)) + ) + ) + + ## ---- + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + set_missing_subjects_row_label(3) + ) + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + set_missing_subjects_row_label(c("x", "y")) + ) + ) + + expect_error({ + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_shift(vars(AEBODSYS, AEDECOD)) %>% + set_missing_subjects_row_label("x") + )}, "`set_missing_subjects_row_label` for shift layers" + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_desc(RACEN) %>% + set_missing_subjects_row_label("x") + ) + ) + +}) + +test_that("Missing subjects row calculates correctly", { + x <- tplyr_table(tplyr_adlb, TRTA, cols=SEX) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(ANRIND, by = vars(PARAM, AVISIT)) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx", distinct_n)) + ) %>% + build() + + # Check 1 + in_res1 <- x %>% + filter(row_label3 == "Missing", row_label1 == "Blood Urea Nitrogen (mmol/L)", row_label2 == "Week 12") %>% + pull(var1_Placebo_F) %>% + as.numeric() + + pop1 <- tplyr_adsl %>% + filter(TRT01A == "Placebo", SEX == "F") %>% + nrow() + + dat1 <- tplyr_adlb %>% + filter(PARAM == "Blood Urea Nitrogen (mmol/L)", AVISIT == "Week 12", TRTA == "Placebo", SEX == "F") %>% + distinct(USUBJID) %>% + nrow() + + expect_equal(pop1-dat1, in_res1) + + # Check 2 + in_res2 <- x %>% + filter(row_label3 == "Missing", row_label1 == "Gamma Glutamyl Transferase (U/L)", row_label2 == "Week 24") %>% + pull(`var1_Xanomeline Low Dose_M`) %>% + as.numeric() + + pop2 <- tplyr_adsl %>% + filter(TRT01A == "Xanomeline Low Dose", SEX == "M") %>% + nrow() + + dat2 <- tplyr_adlb %>% + filter(PARAM == "Gamma Glutamyl Transferase (U/L)", AVISIT == "Week 24", TRTA == "Xanomeline Low Dose", SEX == "M") %>% + distinct(USUBJID) %>% + nrow() + + expect_equal(pop2-dat2, in_res2) + +}) + +test_that("Missing counts on nested count layers function correctly", { + x <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) + ) %>% + build() + + expect_equal(nrow(x %>% filter(row_label2 == " Missing")), 1) + expect_equal(tail(x, 1)$ord_layer_2, Inf) + + x <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_distinct_by(USUBJID) %>% + set_order_count_method("bycount") %>% + set_ordering_cols("Xanomeline High Dose") %>% + set_result_order_var(distinct_n) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) + ) %>% + build() + +}) From 483412085696cfd5dc31751a3ea94e039ef76d02 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Fri, 9 Feb 2024 16:26:52 +0000 Subject: [PATCH 21/24] Finishing testing and remaining updates --- R/count.R | 3 +- R/sort.R | 68 ++++++++++++++++++---------------- tests/testthat/_snaps/count.md | 24 ++++++++++++ tests/testthat/test-count.R | 16 +++++++- 4 files changed, 78 insertions(+), 33 deletions(-) diff --git a/R/count.R b/R/count.R index 270f3d64..c896a930 100644 --- a/R/count.R +++ b/R/count.R @@ -375,7 +375,8 @@ process_missing_subjects_row <- function(x) { # aren't symbols group_by(!!!extract_character_from_quo(by)) %>% # ungroup right away to make sure the complete works - ungroup() + ungroup() %>% + select(-c(n_present, header_tots)) }, envir = x) } diff --git a/R/sort.R b/R/sort.R index 00d37436..d36563a2 100644 --- a/R/sort.R +++ b/R/sort.R @@ -627,7 +627,7 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, } if(!is.null(missing_subjects_index) && !is.null(missing_subjects_sort_value)) { - numeric_ordering_data[missing_subjects_index,] <- missing_subjects_sort_value + numeric_ordering_data[missing_subjects_index,] <- missing_subjects_sort_value } # This is the numeric index that the numeric data is in. radix was chosen because @@ -776,22 +776,24 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { # Only include the parts of the numeric data that is in the current label filter(numeric_data$summary_var %in% present_vars, !!by[[1]] == outer_value) %>% # Remove nesting prefix to prepare numeric data. - mutate(summary_var := str_sub(summary_var, indentation_length)) + mutate(summary_var := str_sub(summary_var, indentation_length+1)) #Same idea here, remove prefix filtered_group_data <- tail(group_data, -outer_nest_rows) %>% mutate(!!row_label_vec[length(row_label_vec)] := str_sub(.data[[row_label_vec[length(row_label_vec)]]], indentation_length + 1)) - # Pick up here = we need to get the missing_subjects_index, but the label - # isn't so need to find the index and then pass that into get_data_order_bycount - browser() + + # Identify the index of missing subjects + if(!is.null(missing_subjects_row_label)) { + missing_subjects_index <- which(filtered_group_data[[length(row_label_vec)]] %in% missing_subjects_row_label) + } + # The first row is always the first thing in the order so make it Inf group_data[1:outer_nest_rows, paste0("ord_layer_", final_col + 1)] <- ifelse((is.null(outer_inf) || outer_inf), Inf, -Inf) if(tail(order_count_method, 1) == "bycount") { if (nrow(group_data) > 1) { - browser() group_data[ (outer_nest_rows + 1): nrow(group_data), paste0("ord_layer_", final_col + 1)] <- get_data_order_bycount(filtered_numeric_data, @@ -808,31 +810,35 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { numeric_cutoff_stat = numeric_cutoff_stat, numeric_cutoff_column = numeric_cutoff_column, nested = TRUE) - } } else if(tail(order_count_method, 1) == "byvarn") { - - varn_df <- get_varn_values(target, target_var[[1]]) - - group_data[ - (outer_nest_rows + 1): nrow(group_data), - paste0("ord_layer_", final_col + 1) - ] <- get_data_order_byvarn(filtered_group_data, - varn_df, - target_var[[1]], - length(by) + 1, - indentation, - total_row_sort_value = total_row_sort_value, - missing_subjects_sort_value = missing_subjects_sort_value) - } else { - group_row_count <- nrow(group_data[(outer_nest_rows + 1): nrow(group_data),]) - # Logic for group_row_count is when numeric_where values cause unexpected results - group_data[ - (outer_nest_rows + 1): nrow(group_data), - paste0("ord_layer_", final_col + 1) - ] <- 1:ifelse(group_row_count == 0, 1, group_row_count) - - # missing_subjects_row_label not passing in here - missing_rows <- group_data[[paste0('row_label', final_col+1)]] == paste0(indentation, missing_subjects_row_label) - group_data[missing_rows, paste0("ord_layer_", final_col + 1)] <- missing_subjects_sort_value + } + + } else if(tail(order_count_method, 1) == "byvarn") { + + varn_df <- get_varn_values(target, target_var[[1]]) + + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1) + ] <- get_data_order_byvarn(filtered_group_data, + varn_df, + target_var[[1]], + length(by) + 1, + indentation, + total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) + } else { + group_row_count <- nrow(group_data[(outer_nest_rows + 1): nrow(group_data),]) + # Logic for group_row_count is when numeric_where values cause unexpected results + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1) + ] <- 1:ifelse(group_row_count == 0, 1, group_row_count) + + # missing_subjects_row_label not passing in here + if (!is.null(missing_subjects_sort_value)) { + missing_rows <- group_data[[paste0('row_label', final_col+1)]] == paste0(indentation, missing_subjects_row_label) + group_data[missing_rows, paste0("ord_layer_", final_col + 1)] <- missing_subjects_sort_value + } } group_data diff --git a/tests/testthat/_snaps/count.md b/tests/testthat/_snaps/count.md index 6045b796..01d31a37 100644 --- a/tests/testthat/_snaps/count.md +++ b/tests/testthat/_snaps/count.md @@ -615,3 +615,27 @@ var1_54_F var1_54_M var1_81_F var1_81_M 1 27 (54.0) [50] 23 (67.6) [34] 17 (42.5) [40] 26 (59.1) [44] +# Error checking for add_missing_subjects_row() + + Argument `fmt` does not inherit "f_str". Classes: character + +--- + + Argument `sort_value` does not inherit "numeric". Classes: character + +--- + + Argument `e` does not inherit "count_layer". Classes: tplyr_layer, desc_layer, environment + +--- + + Argument `missing_subjects_row_label` must be character. Instead a class of "numeric" was passed. + +--- + + length(missing_subjects_row_label) not equal to 1 + +--- + + Argument `e` does not inherit "count_layer". Classes: tplyr_layer, desc_layer, environment + diff --git a/tests/testthat/test-count.R b/tests/testthat/test-count.R index f8219061..c8fc41ce 100644 --- a/tests/testthat/test-count.R +++ b/tests/testthat/test-count.R @@ -1077,6 +1077,7 @@ test_that("Missing counts on nested count layers function correctly", { expect_equal(nrow(x %>% filter(row_label2 == " Missing")), 1) expect_equal(tail(x, 1)$ord_layer_2, Inf) + # Verify that bycount works for missing values and sort value is assigned correctly x <- tplyr_table(tplyr_adae, TRTA) %>% set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01A) %>% @@ -1086,8 +1087,21 @@ test_that("Missing counts on nested count layers function correctly", { set_order_count_method("bycount") %>% set_ordering_cols("Xanomeline High Dose") %>% set_result_order_var(distinct_n) %>% - add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = 99999) + ) %>% + build() + + expect_equal(tail(x, 1)$ord_layer_2, 99999) + + # Also test that label reassignment flows + x <- tplyr_table(tplyr_adsl, TRT01A) %>% + add_layer( + group_count(vars(SEX, RACE)) %>% + set_order_count_method(c("byfactor", "byvarn")) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = 99999) %>% + set_missing_subjects_row_label("New label") ) %>% build() + expect_equal(filter(x, row_label2 == " New label")$ord_layer_2, c(99999, 99999)) }) From dc2e92c49e2e4bad0d3810a9b5b3638e82ab8fee Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Fri, 9 Feb 2024 17:28:10 +0000 Subject: [PATCH 22/24] R CMD check updates --- R/count_bindings.R | 15 +++++++++++---- R/zzz.R | 3 ++- man/add_missing_subjects_row.Rd | 10 ++++++++-- man/set_missing_subjects_row_label.Rd | 4 ++-- vignettes/denom.Rmd | 26 ++++++++++++++++++++++++++ 5 files changed, 49 insertions(+), 9 deletions(-) diff --git a/R/count_bindings.R b/R/count_bindings.R index 1eb1ddcb..bc86aa84 100644 --- a/R/count_bindings.R +++ b/R/count_bindings.R @@ -714,8 +714,15 @@ set_numeric_threshold <- function(e, numeric_cutoff, stat, column = NULL) { #' Add a missing subject row into a count summary. #' -#' -#' @param e A \code{count_layer} object +#' This function calculates the number of subjects missing from a particular +#' group of results. The calculation is done by examining the total number of +#' subjects potentially available from the Header N values within the result +#' column, and finding the difference with the total number of subjects present +#' in the result group. Note that for accurate results, the subject variable +#' needs to be defined using the `set_distinct_by()` function. As with other +#' methods, this function instructs how distinct results should be identified. +#' +#' @param e A `count_layer` object #' @param fmt An f_str object used to format the total row. If none is provided, #' display is based on the layer formatting. #' @param sort_value The value that will appear in the ordering column for total @@ -748,7 +755,7 @@ add_missing_subjects_row <- function(e, fmt = NULL, sort_value = NULL) { #' Set the label for the missing subjects row #' #' @param e A \code{count_layer} object -#' @param total_row_label A character to label the total row +#' @param missing_subjects_row_label A character to label the total row #' #' @return The modified \code{count_layer} object #' @export @@ -759,7 +766,7 @@ add_missing_subjects_row <- function(e, fmt = NULL, sort_value = NULL) { #' add_layer( #' group_count(cyl) %>% #' add_missing_subjects_row() %>% -#' set_missing_subjects_label("Missing") +#' set_missing_subjects_row_label("Missing") #' ) #' build(t) set_missing_subjects_row_label <- function(e, missing_subjects_row_label) { diff --git a/R/zzz.R b/R/zzz.R index b7c53fba..5b1989d4 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -314,4 +314,5 @@ missing_subject_vars_ord <- NULL string_ms <- NULL missing_subjects_sort_value <- NULL limit_data_by <- NULL - +n_present <- NULL +header_tots <- NULL diff --git a/man/add_missing_subjects_row.Rd b/man/add_missing_subjects_row.Rd index cfb873d7..55769f5a 100644 --- a/man/add_missing_subjects_row.Rd +++ b/man/add_missing_subjects_row.Rd @@ -7,7 +7,7 @@ add_missing_subjects_row(e, fmt = NULL, sort_value = NULL) } \arguments{ -\item{e}{A \code{count_layer} object} +\item{e}{A `count_layer` object} \item{fmt}{An f_str object used to format the total row. If none is provided, display is based on the layer formatting.} @@ -16,7 +16,13 @@ display is based on the layer formatting.} rows. This must be a numeric value.} } \description{ -Add a missing subject row into a count summary. +This function calculates the number of subjects missing from a particular +group of results. The calculation is done by examining the total number of +subjects potentially available from the Header N values within the result +column, and finding the difference with the total number of subjects present +in the result group. Note that for accurate results, the subject variable +needs to be defined using the `set_distinct_by()` function. As with other +methods, this function instructs how distinct results should be identified. } \examples{ diff --git a/man/set_missing_subjects_row_label.Rd b/man/set_missing_subjects_row_label.Rd index ed75af9e..bcc943a1 100644 --- a/man/set_missing_subjects_row_label.Rd +++ b/man/set_missing_subjects_row_label.Rd @@ -9,7 +9,7 @@ set_missing_subjects_row_label(e, missing_subjects_row_label) \arguments{ \item{e}{A \code{count_layer} object} -\item{total_row_label}{A character to label the total row} +\item{missing_subjects_row_label}{A character to label the total row} } \value{ The modified \code{count_layer} object @@ -23,7 +23,7 @@ t <- tplyr_table(mtcars, gear) \%>\% add_layer( group_count(cyl) \%>\% add_missing_subjects_row() \%>\% - set_missing_subjects_label("Missing") + set_missing_subjects_row_label("Missing") ) build(t) } diff --git a/vignettes/denom.Rmd b/vignettes/denom.Rmd index af221fa5..18d2a9c9 100644 --- a/vignettes/denom.Rmd +++ b/vignettes/denom.Rmd @@ -233,6 +233,32 @@ t %>% We did one more other thing worth explaining in the example above - gave the missing count its own sort value. If you leave this field null, it will simply be the maximum value in the order layer plus 1, to put the Missing counts at the bottom during an ascending sort. But tables can be sorted a lot of different ways, as you'll see in the sort vignette. So instead of trying to come up with novel ways for you to control where the missing row goes - we decided to just let you specify your own value. +## Missing Subjects + +Missing counts and counting missing subjects work two different ways within Tplyr. Missing counts, as described above, will examine the records present in the data and collect and missing values. But for these results to be counted, they need to first be provided within the input data itself. On the other hand, missing subjects are calculated by looking at the difference between the potential number of subjects within the column (i.e. the combination of the treatment variables and column variables) and the number of subjects actually present. Consider this example: + +```{r missing_subs1} + missing_subs <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_nest_count(TRUE) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) %>% + set_missing_subjects_row_label("Missing Subjects") + ) %>% + build() + + tail(missing_subs) %>% + select(-starts_with('ord')) %>% + kable() +``` + +In the example above, we produce a nested count layer. Using the function `add_missing_subjects_row()`. This triggers the addition of the new result row for which the missing subjects are calculated. The row label applied for this can be configured using `set_missing_subjects_row_label()`, and the row label itself will default for Missing. Depending on your sorting needs, a `sort_value` can be applied to whatever numeric value you provide. Lastly, you can provide an `f_str()` to format the missing subjects row separately from the rest of the layer, but whatever format is applied to the layer will apply otherwise. + +Note that in nested count layers, missing subject rows will generate for each independent group within the outer layer. Outer layers cannot have missing subject rows calculated individually. This would best be done in an independent layer itself, as the result would apply to the whole input target dataset. + ## Adding a 'Total' Row In addition to missing counts, some summaries require the addition of a 'Total' row. **Tplyr** has the helper function `add_total_row()` to ease this process for you. Like most other things within **Tplyr** - particularly in this vignette - this too has a significant bit of nuance to it. From 8d70c50be433d0bcaac2cdeb623170c73f8ac003 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Mon, 12 Feb 2024 19:03:14 +0000 Subject: [PATCH 23/24] Update comments --- vignettes/denom.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/denom.Rmd b/vignettes/denom.Rmd index 18d2a9c9..cb657c4e 100644 --- a/vignettes/denom.Rmd +++ b/vignettes/denom.Rmd @@ -255,7 +255,7 @@ Missing counts and counting missing subjects work two different ways within Tply kable() ``` -In the example above, we produce a nested count layer. Using the function `add_missing_subjects_row()`. This triggers the addition of the new result row for which the missing subjects are calculated. The row label applied for this can be configured using `set_missing_subjects_row_label()`, and the row label itself will default for Missing. Depending on your sorting needs, a `sort_value` can be applied to whatever numeric value you provide. Lastly, you can provide an `f_str()` to format the missing subjects row separately from the rest of the layer, but whatever format is applied to the layer will apply otherwise. +In the example above, we produce a nested count layer. The function `add_missing_subjects_row()` triggers the addition of the new result row for which the missing subjects are calculated. The row label applied for this can be configured using `set_missing_subjects_row_label()`, and the row label itself will default to 'Missing'. Depending on your sorting needs, a `sort_value` can be applied to whatever numeric value you provide. Lastly, you can provide an `f_str()` to format the missing subjects row separately from the rest of the layer, but whatever format is applied to the layer will apply otherwise. Note that in nested count layers, missing subject rows will generate for each independent group within the outer layer. Outer layers cannot have missing subject rows calculated individually. This would best be done in an independent layer itself, as the result would apply to the whole input target dataset. From b261356f15b9830ee28c23f1f89ae5e54f79d091 Mon Sep 17 00:00:00 2001 From: "mike.stackhouse" Date: Tue, 13 Feb 2024 19:45:21 +0000 Subject: [PATCH 24/24] PR review comments. --- R/count.R | 1 - R/desc.R | 16 ---------------- 2 files changed, 17 deletions(-) diff --git a/R/count.R b/R/count.R index c896a930..49a6e15f 100644 --- a/R/count.R +++ b/R/count.R @@ -108,7 +108,6 @@ process_summaries.count_layer <- function(x, ...) { process_count_denoms(x) - outer <- FALSE process_single_count_target(x) } diff --git a/R/desc.R b/R/desc.R index 829da546..40e36686 100644 --- a/R/desc.R +++ b/R/desc.R @@ -59,22 +59,6 @@ process_summaries.desc_layer <- function(x, ...) { ungroup() num_sums_raw[[i]] <- complete_and_limit(cmplt1, treat_var, by, cols, limit_data_by=limit_data_by) - # num_sums_raw[[i]] <- cmplt1 %>% - # # complete all combinations of factors to include combinations that don't exist. - # # add 0 for combinations that don't exist - # complete(!!treat_var, !!!by, !!!cols) - # - # # Apply data limits specified by setter - # if (exists("limit_data_by")) { - # # Find the combinations actually in the data - # groups_in_data <- cmplt1 %>% - # distinct(!!!limit_data_by) - # - # # Join back to limit the completed levels based on the preferred - # # data driven ones - # num_sums_raw[[i]] <- groups_in_data %>% - # left_join(num_sums_raw[[i]], by = map_chr(limit_data_by, as_name)) - # } # Create the transposed summary data to prepare for formatting trans_sums[[i]] <- num_sums_raw[[i]] %>%