diff --git a/NAMESPACE b/NAMESPACE index d2b779eb..c1e7f2a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,9 @@ S3method(set_denoms_by,shift_layer) S3method(set_format_strings,count_layer) S3method(set_format_strings,desc_layer) S3method(set_format_strings,shift_layer) +S3method(set_limit_data_by,count_layer) +S3method(set_limit_data_by,desc_layer) +S3method(set_limit_data_by,shift_layer) S3method(set_where,tplyr_layer) S3method(set_where,tplyr_table) S3method(str,f_str) @@ -45,6 +48,7 @@ export(add_column_headers) export(add_filters) export(add_layer) export(add_layers) +export(add_missing_subjects_row) export(add_risk_diff) export(add_total_group) export(add_total_row) @@ -101,7 +105,9 @@ export(set_distinct_by) export(set_format_strings) export(set_header_n) export(set_indentation) +export(set_limit_data_by) export(set_missing_count) +export(set_missing_subjects_row_label) export(set_nest_count) export(set_numeric_threshold) export(set_order_count_method) @@ -139,6 +145,7 @@ importFrom(dplyr,between) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) +importFrom(dplyr,count) importFrom(dplyr,cur_column) importFrom(dplyr,cur_group) importFrom(dplyr,desc) diff --git a/R/count.R b/R/count.R index 030db51d..49a6e15f 100644 --- a/R/count.R +++ b/R/count.R @@ -39,6 +39,13 @@ process_summaries.count_layer <- function(x, ...) { sep = " "))) } + # Do this here to make sure that defaults are available everywhere else + # Downstream + if (is.null(include_total_row)) include_total_row <- FALSE + if (is.null(total_row_label)) total_row_label <- "Total" + if (is.null(include_missing_subjects_row)) include_missing_subjects_row <- FALSE + if (is.null(missing_subjects_row_label)) missing_subjects_row_label <- "Missing" + # Save this for the denominator where, but only if it hasn't been saved yet. if (is.null(built_target_pre_where)) built_target_pre_where <- built_target @@ -121,15 +128,12 @@ process_summaries.count_layer <- function(x, ...) { #' If include_total_row is true a row will be added with a total row labeled #' with total_row_label. #' -#' Complete is used to complete the combinaions of by, treat_var, and target_var +#' Complete is used to complete the combinations of by, treat_var, and target_var #' #' @noRd process_single_count_target <- function(x) { evalq({ - if (is.null(include_total_row)) include_total_row <- FALSE - if (is.null(total_row_label)) total_row_label <- "Total" - # The current environment should be the layer itself process_count_n(current_env()) @@ -150,6 +154,10 @@ process_single_count_target <- function(x) { } } + if (include_missing_subjects_row) { + process_missing_subjects_row(current_env()) + } + if (is.null(count_row_prefix)) count_row_prefix <- "" # If a denoms variable is factor then it should be character for the denoms calculations @@ -186,26 +194,13 @@ process_single_count_target <- function(x) { } # rbind tables together - numeric_data <- summary_stat %>% - bind_rows(total_stat) %>% + numeric_data <- bind_rows(summary_stat, total_stat, missing_subjects_stat) %>% rename("summary_var" = !!target_var[[1]]) %>% group_by(!!!denoms_by) %>% do(get_denom_total(., denoms_by, denoms_df_prep, "n")) %>% mutate(summary_var = prefix_count_row(summary_var, count_row_prefix)) %>% ungroup() - - if (!is.null(distinct_stat)) { - if (include_total_row) { - distinct_stat <- distinct_stat %>% - bind_rows(total_stat_denom) %>% - group_by(!!!denoms_by) %>% - do(get_denom_total(., denoms_by, denoms_df, "distinct_n")) - } - numeric_data <- bind_cols(numeric_data, - distinct_stat[, c("distinct_n", "distinct_total")]) - } - rm(denoms_df_prep, fct_cols) }, envir = x) @@ -256,11 +251,15 @@ process_count_n <- function(x) { names(missing_count_list))) } - summary_stat <- summary_stat %>% - # complete all combinations of factors to include combinations that don't exist. - # add 0 for combinations that don't exist - complete(!!treat_var, !!!by, !!!target_var, !!!cols, - fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0)) %>% + # Need to mark this for nested counts + if (!exists('outer_')) outer_ <- FALSE + + complete_levels <- summary_stat %>% + complete_and_limit(treat_var, by, cols, target_var, limit_data_by, + .fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0), + outer=outer_) + + summary_stat <- complete_levels %>% # Change the treat_var and first target_var to characters to resolve any # issues if there are total rows and the original column is numeric mutate(!!treat_var := as.character(!!treat_var)) %>% @@ -275,6 +274,25 @@ process_count_n <- function(x) { } + +#' Get Logical vector that is used to remove the treat_var and cols +#' +#' In total row and missing subject counts, denoms_by needs to be stripped of +#' cols and treat_var variables, otherwise it will error out in the group_by +#' +#' @param denoms_by The layer denoms by +#' @param treat_var table treat var +#' @param cols tables cols vars +#' +#' @return list of quosures +#' @noRd +get_needed_denoms_by <- function(denoms_by, treat_var, cols) { + map_lgl(denoms_by, function(x, treat_var, cols) { + all(as_name(x) != as_name(treat_var), + as_name(x) != map_chr(cols, as_name)) + }, treat_var, cols) +} + #' Process the amounts for a total row #' #' @param x A Count layer @@ -290,16 +308,8 @@ no denoms_by variable was set. This may cause unexpected results. If you wish to change this behavior, use `set_denoms_by()`.", immediate. = TRUE) } - # Make sure the denoms_by is stripped - # Stripped of cols and treat_var variables, otherwise it will error out in the group_by - # I thought of replacing the group by with !!!unique(c(treat_var, cols, denoms_by)) - # but that doesn't work due to the denoms_by having an environment set - # Logical vector that is used to remove the treat_var and cols - needed_denoms_by <- map_lgl(denoms_by, function(x, treat_var, cols) { - all(as_name(x) != as_name(treat_var), - as_name(x) != map_chr(cols, as_name)) - }, treat_var, cols) + needed_denoms_by <- get_needed_denoms_by(denoms_by, treat_var, cols) #Create an expression to evaluate filter if (!count_missings) { @@ -326,9 +336,46 @@ change this behavior, use `set_denoms_by()`.", immediate. = TRUE) # aren't symbols group_by(!!!extract_character_from_quo(by)) %>% # ungroup right away to make sure the complete works + ungroup() + }, envir = x) +} + +#' Process the amounts for a missing subjects row +#' +#' @param x A Count layer +#' @noRd +process_missing_subjects_row <- function(x) { + evalq({ + + # Logical vector that is used to remove the treat_var and cols + needed_denoms_by <- get_needed_denoms_by(denoms_by, treat_var, cols) + + # Create the merge variables to join the header_n data + mrg_vars <- map_chr(c(pop_treat_var, cols, denoms_by[needed_denoms_by]), as_name) + names(mrg_vars)[1] <- as_name(treat_var) + # create a data.frame to create total counts + missing_subjects_stat <- built_target %>% + # Use distinct if this is a distinct total row + # Group by all column variables + distinct(!!treat_var, !!!cols, !!!by, !!!distinct_by) %>% ungroup() %>% + count(!!treat_var, !!!cols, !!!by, name="n_present") %>% # complete based on missing groupings - complete(!!treat_var, !!!cols, fill = list(n = 0, total = 0)) + complete(!!treat_var, !!!cols, !!!by, fill = list(n_present = 0)) %>% + left_join( + header_n %>% rename(header_tots = n), by = mrg_vars + ) %>% + # Create a variable to label the totals when it is merged in. + mutate( + !!as_name(target_var[[1]]) := missing_subjects_row_label, + distinct_n = header_tots - n_present + ) %>% + # Create variables to carry forward 'by'. Only pull out the ones that + # aren't symbols + group_by(!!!extract_character_from_quo(by)) %>% + # ungroup right away to make sure the complete works + ungroup() %>% + select(-c(n_present, header_tots)) }, envir = x) } @@ -418,7 +465,9 @@ process_formatting.count_layer <- function(x, ...) { summary_var = summary_var, indentation_length = indentation_length, total_count_format = total_count_format, + missing_subjects_count_format = missing_subjects_count_format, total_row_label = total_row_label, + missing_subjects_row_label = missing_subjects_row_label, has_missing_count = has_missing_count) }) %>% # Pivot table @@ -479,19 +528,26 @@ process_formatting.count_layer <- function(x, ...) { #' target variable. #' @param indentation_length If this is a nested count layer. The row prefixes #' must be removed +#' @param total_count_format f_str for total counts +#' @param missing_subjects_count_format f_str for missing subjects +#' @param total_row_label Label string for total rows +#' @param missing_subjects_row_label Label string for missing subjects +#' @param has_missing_count Boolean for if missing counts are present #' #' @return A tibble replacing the original counts #' @noRd construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_total = NULL, count_fmt = NULL, max_layer_length, max_n_width, missing_string, missing_f_str, summary_var, indentation_length, total_count_format, - total_row_label, has_missing_count) { + missing_subjects_count_format, total_row_label, missing_subjects_row_label, + has_missing_count) { ## Added this for processing formatting in nested count layers where this won't be processed yet if (is.null(max_layer_length)) max_layer_length <- 0 if (is.null(max_n_width)) max_n_width <- 0 missing_rows <- FALSE total_rows <- FALSE + missing_subject_rows <- FALSE # Add in the missing format if its null and there are missing counts if (has_missing_count && is.null(missing_f_str)) { @@ -515,6 +571,12 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot total_vars_ord <- map_chr(total_count_format$vars, as_name) } + ## Pull out string information for missing subject rows + if (!is.null(missing_subjects_count_format)) { + missing_subject_rows <- summary_var %in% missing_subjects_row_label + missing_subject_vars_ord <- map_chr(missing_subjects_count_format$vars, as_name) + } + vars_ord <- map_chr(count_fmt$vars, as_name) # str_all is a list that contains character vectors for each parameter that might be calculated @@ -522,12 +584,16 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot # Append the repl_str to be passed to do.call str_all[1] <- count_fmt$repl_str # Iterate over every variable + rows_ <- !missing_rows & !total_rows & !missing_subject_rows for (i in seq_along(vars_ord)) { - str_all[[i + 1]] <- count_string_switch_help(vars_ord[i], count_fmt, .n[!missing_rows & !total_rows], .total[!missing_rows & !total_rows], - .distinct_n[!missing_rows & !total_rows], .distinct_total[!missing_rows & !total_rows], vars_ord) + str_all[[i + 1]] <- count_string_switch_help(vars_ord[i], count_fmt, + .n[rows_], + .total[rows_], + .distinct_n[rows_], + .distinct_total[rows_], + vars_ord) } - # Logic for missing # Same logic as above, just add for missing missing_str_all <- vector("list", 5) @@ -554,20 +620,30 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot total_vars_ord) } + missing_subs_str_all <- vector("list", 5) + missing_subs_str_all[1] <- missing_subjects_count_format$repl_str + for (i in seq_along(missing_subject_vars_ord)) { + missing_subs_str_all[[i + 1]] <- count_string_switch_help(missing_subject_vars_ord[i], + missing_subjects_count_format, + .n[missing_subject_rows], + .total[missing_subject_rows], + .distinct_n[missing_subject_rows], + .distinct_total[missing_subject_rows], + missing_subject_vars_ord) + } + # Put the vector strings together. Only include parts of str_all that aren't null - # nm is non-missing, m is mising, and t is total. + # nm is non-missing, m is missing, t is total, ms is missing subjects string_nm <- do.call(sprintf, str_all[!map_lgl(str_all, is.null)]) if (!is.null(missing_vars_ord)) string_m <- do.call(sprintf, missing_str_all[!map_lgl(missing_str_all, is.null)]) if (!is.null(total_vars_ord)) string_t <- do.call(sprintf, total_str_all[!map_lgl(total_str_all, is.null)]) + if (!is.null(missing_subject_vars_ord)) string_ms <- do.call(sprintf, missing_subs_str_all[!map_lgl(missing_subs_str_all, is.null)]) # string_ is the final string to return. Merge the missing, non-missing, and others together - string_ <- character(length(string_nm) + length(string_m) + length(string_t)) - string_[!missing_rows & !total_rows] <- string_nm + string_ <- character(sum(length(string_nm), length(string_m), length(string_t), length(string_ms))) + string_[rows_] <- string_nm string_[total_rows] <- string_t string_[missing_rows] <- string_m - - - - + string_[missing_subject_rows] <- string_ms # Left pad set to 0 meaning it won't pad to the left at all # right pad is set to the maximum n count in the table string_ <- pad_formatted_data(string_, 0, max_n_width) diff --git a/R/count_bindings.R b/R/count_bindings.R index 197c6757..bc86aa84 100644 --- a/R/count_bindings.R +++ b/R/count_bindings.R @@ -711,3 +711,74 @@ set_numeric_threshold <- function(e, numeric_cutoff, stat, column = NULL) { e } + +#' Add a missing subject row into a count summary. +#' +#' This function calculates the number of subjects missing from a particular +#' group of results. The calculation is done by examining the total number of +#' subjects potentially available from the Header N values within the result +#' column, and finding the difference with the total number of subjects present +#' in the result group. Note that for accurate results, the subject variable +#' needs to be defined using the `set_distinct_by()` function. As with other +#' methods, this function instructs how distinct results should be identified. +#' +#' @param e A `count_layer` object +#' @param fmt An f_str object used to format the total row. If none is provided, +#' display is based on the layer formatting. +#' @param sort_value The value that will appear in the ordering column for total +#' rows. This must be a numeric value. +#' +#' @export +#' @examples +#' +#' tplyr_table(mtcars, gear) %>% +#' add_layer( +#' group_count(cyl) %>% +#' add_missing_subjects_row(f_str("xxxx", n)) +#' ) %>% +#' build() +add_missing_subjects_row <- function(e, fmt = NULL, sort_value = NULL) { + if(!is.null(fmt)) assert_inherits_class(fmt, "f_str") + if(!is.null(sort_value)) assert_inherits_class(sort_value, "numeric") + if("shift_layer" %in% class(e)) { + rlang::abort("`add_missing_subjects_row` for shift layers is not yet supported") + } + assert_inherits_class(e, "count_layer") + + env_bind(e, include_missing_subjects_row = TRUE) + env_bind(e, missing_subjects_count_format = fmt) + env_bind(e, missing_subjects_sort_value = sort_value) + + e +} + +#' Set the label for the missing subjects row +#' +#' @param e A \code{count_layer} object +#' @param missing_subjects_row_label A character to label the total row +#' +#' @return The modified \code{count_layer} object +#' @export +#' +#' @examples +#' +#' t <- tplyr_table(mtcars, gear) %>% +#' add_layer( +#' group_count(cyl) %>% +#' add_missing_subjects_row() %>% +#' set_missing_subjects_row_label("Missing") +#' ) +#' build(t) +set_missing_subjects_row_label <- function(e, missing_subjects_row_label) { + + assert_has_class(missing_subjects_row_label, "character") + assert_that(length(missing_subjects_row_label) == 1) + if("shift_layer" %in% class(e)) { + rlang::abort("`set_missing_subjects_row_label` for shift layers is not yet supported") + } + assert_inherits_class(e, "count_layer") + + env_bind(e, missing_subjects_row_label = missing_subjects_row_label) + + e +} diff --git a/R/data.R b/R/data.R index 25332b39..2627ef3f 100644 --- a/R/data.R +++ b/R/data.R @@ -47,6 +47,14 @@ #' "tplyr_adlb" +#' ADPE Data +#' +#' A mock-up dataset that is fit for testing data limiting +#' +#' @format A data.frame with 21 rows and 8 columns. +#' +#' +"tplyr_adpe" #' Get Data Labels #' diff --git a/R/desc.R b/R/desc.R index 00922c6a..40e36686 100644 --- a/R/desc.R +++ b/R/desc.R @@ -49,16 +49,16 @@ process_summaries.desc_layer <- function(x, ...) { summaries <- get_summaries()[match_exact(summary_vars)] # Create the numeric summary data - num_sums_raw[[i]] <- built_target %>% + cmplt1 <- built_target %>% # Rename the current variable to make each iteration use a generic name rename(.var = !!cur_var) %>% # Group by treatment, provided by variable, and provided column variables group_by(!!treat_var, !!!by, !!!cols) %>% # Execute the summaries summarize(!!!summaries) %>% - ungroup() %>% - # Fill in any missing treat/col combinations - complete(!!treat_var, !!!by, !!!cols) + ungroup() + + num_sums_raw[[i]] <- complete_and_limit(cmplt1, treat_var, by, cols, limit_data_by=limit_data_by) # Create the transposed summary data to prepare for formatting trans_sums[[i]] <- num_sums_raw[[i]] %>% diff --git a/R/nested.R b/R/nested.R index e05e854c..9843c188 100644 --- a/R/nested.R +++ b/R/nested.R @@ -32,9 +32,15 @@ process_nested_count_target <- function(x) { second_denoms_by <- denoms_by } - first_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[1]], - by = vars(!!!by), where = !!where)) + # Missing subject counts should not occur in the outer layer + fl <- group_count(current_env(), target_var = !!target_var[[1]], + by = vars(!!!by), where = !!where) + fl$include_missing_subjects_row <- FALSE + outer_ <- TRUE + first_layer <- process_summaries(fl) + + outer_ <- FALSE second_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[2]], by = vars(!!target_var[[1]], !!!by), where = !!where) %>% set_count_row_prefix(indentation) %>% @@ -50,7 +56,8 @@ process_nested_count_target <- function(x) { treat_var = treat_var ) %>% group_by(!!target_var[[1]]) %>% - do(filter_nested_inner_layer(., target, target_var[[1]], target_var[[2]], indentation)) + do(filter_nested_inner_layer(., target, target_var[[1]], target_var[[2]], indentation, + missing_subjects_row_label)) ignored_filter_rows <- ifelse(include_total_row, ifelse(is.null(total_row_label), @@ -85,7 +92,8 @@ process_nested_count_target <- function(x) { #' This function is meant to remove the values of an inner layer that don't #' appear in the target data #' @noRd -filter_nested_inner_layer <- function(.group, target, outer_name, inner_name, indentation) { +filter_nested_inner_layer <- function(.group, target, outer_name, inner_name, indentation, + missing_subjects_row_label) { # Is outer variable text? If it is don't filter on it text_outer <- !quo_is_symbol(outer_name) @@ -108,9 +116,13 @@ filter_nested_inner_layer <- function(.group, target, outer_name, inner_name, in filter(!!sym(outer_name) == current_outer_value) %>% select(any_of(inner_name)) %>% unlist() %>% - paste0(indentation, .) + paste0(indentation, .) %>% + unique() } + target_inner_values <- c(target_inner_values %>% unique(), + paste0(indentation, missing_subjects_row_label)) + .group %>% filter(summary_var %in% target_inner_values) diff --git a/R/set_limit_data_by.R b/R/set_limit_data_by.R new file mode 100644 index 00000000..321c3366 --- /dev/null +++ b/R/set_limit_data_by.R @@ -0,0 +1,134 @@ +#' Set variables to limit reported data values only to those that exist rather +#' than fully completing all possible levels +#' +#' This function allows you to select a combination of by variables or +#' potentially target variables for which you only want to display values +#' present in the data. By default, Tplyr will create a cartesian combination of +#' potential values of the data. For example, if you have 2 by variables +#' present, then each potential combination of those by variables will have a +#' row present in the final table. `set_limit_data_by()` allows you to choose +#' the by variables whose combination you wish to limit to values physically +#' present in the available data. +#' +#' @param e A tplyr_layer +#' @param ... Subset of variables within by or target variables +#' +#' @return a tplyr_table +#' @md +#' @export +#' +#' @examples +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) +#' ) %>% +#' build() +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% +#' set_limit_data_by(PARAM, AVISIT) +#' ) %>% +#' build() +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% +#' set_limit_data_by(PARAM, AVISIT) +#' ) %>% +#' build() +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% +#' set_limit_data_by(PECAT, PARAM, AVISIT) +#' ) %>% +#' build() +set_limit_data_by <- function(e, ...) { + UseMethod("set_limit_data_by") +} + +#' @export +#' @noRd +set_limit_data_by.count_layer <- function(e, ...) { + dots <- enquos(...) + dots_chr <- map_chr(dots, as_name) + + # Pull these variables to make sure the denoms used make sense + by_ <- map_chr(env_get(e, "by"), as_name) + tv_ <- map_chr(env_get(e, "target_var"), as_name) + + if (!all(dots_chr %in% c(by_, tv_))) { + stop("Limit data by variables must be included in by variables or target variable set on layer", call.=FALSE) + } + + env_bind(e, limit_data_by = dots) + e +} + +#' @export +#' @noRd +set_limit_data_by.shift_layer <- function(e, ...) { + set_limit_data_by.count_layer(e, ...) +} + +#' @export +#' @noRd +set_limit_data_by.desc_layer <- function(e, ...) { + dots <- enquos(...) + dots_chr <- map_chr(dots, as_name) + + # Pull these variables to make sure the denoms used make sense + by_ <- map_chr(env_get(e, "by"), as_name) + + if (!all(dots_chr %in% by_)) { + stop("Limit data by variables must be included in by variables set on layer", call.=FALSE) + } + + env_bind(e, limit_data_by = dots) + e +} + +#' General function used to process the steps to pad levels in data, or limit to +#' combinations available in the data itself +#' +#' @param dat Input dataset +#' @param treat_var treat_var from tplyr_table +#' @param by by from tplyr_layer +#' @param cols cols from tplyr_table +#' @param target_var target_var from tplyr_layer +#' @param limit_data_by The variables to limit data by +#' @param .fill .fill parameter passed onto dplyr::complete +#' @param outer Whether to bypass variables if working through the outer layer +#' +#' @noRd +complete_and_limit <- function(dat, treat_var, by, cols, target_var=quos(), limit_data_by, .fill=list(), outer=FALSE) { + + complete_levels <- dat %>% + # complete all combinations of factors to include combinations that don't exist. + # add 0 for combinations that don't exist + complete(!!treat_var, !!!by, !!!unname(target_var), !!!cols, + fill = .fill) + + # Apply data limits specified by setter + if (!is.null(limit_data_by)) { + # Outer layer won't have the target variable to limit by + if (outer) { + limit_data_by <- limit_data_by[map_chr(limit_data_by, as_name) %in% names(dat)] + } + + # Find the combinations actually in the data + groups_in_data <- dat %>% + distinct(!!!limit_data_by) + + # Join back to limit the completed levels based on the preferred + # data driven ones + limited_data <- groups_in_data %>% + left_join(complete_levels, by = map_chr(limit_data_by, as_name)) + + return(limited_data) + } + + complete_levels +} diff --git a/R/shift.R b/R/shift.R index a6a38d7b..c6a42d9f 100644 --- a/R/shift.R +++ b/R/shift.R @@ -39,7 +39,6 @@ process_summaries.shift_layer <- function(x, ...) { process_shift_n <- function(x) { evalq({ - numeric_data <- built_target %>% # Group by variables including target variables and count them group_by(!!treat_var, !!!by, !!!unname(target_var), !!!cols) %>% @@ -47,7 +46,9 @@ process_shift_n <- function(x) { ungroup() %>% # complete all combinations of factors to include combinations that don't exist. # add 0 for combinations that don't exist - complete(!!treat_var, !!!by, !!!unname(target_var), !!!cols, fill = list(n = 0)) %>% + # complete(!!treat_var, !!!by, !!!unname(target_var), !!!cols, fill = list(n = 0)) %>% + complete_and_limit(treat_var, by, cols, unname(target_var), + limit_data_by, .fill = list(n = 0)) %>% # Change the treat_var and first target_var to characters to resolve any # issues if there are total rows and the original column is numeric mutate(!!treat_var := as.character(!!treat_var)) %>% diff --git a/R/sort.R b/R/sort.R index 1409f713..d36563a2 100644 --- a/R/sort.R +++ b/R/sort.R @@ -194,20 +194,28 @@ add_order_columns.count_layer <- function(x) { expr(!!sym(as_name(x)) == !!as_name(y)) }) + # Get the number of unique outer values, that is the number of rows to pull out. # If its text, it is just 1 to pull out - outer_number <- ifelse(quo_is_symbol(by[[1]]), - # Use built_target here to take the 'where' logic into account - length(unlist(unique(built_target[, as_name(by[[1]])]))), - 1) + # outer_number <- ifelse(quo_is_symbol(by[[1]]), + # # Use built_target here to take the 'where' logic into account + # nrow(filter(numeric_data, is.na(!!by[[1]]))), + # 1) + + # Identify the outer layer and attach it to the filter logic + filter_logic <- append(filter_logic, ifelse( + quo_is_symbol(by[[1]]), # Is the outside variable character or a symbol? + exprs(is.na(!!by[[1]])), # For symbols, the outer var will be NA + exprs(summary_var == !!by[[1]]) # For character, it will match summary_var + )) all_outer <- numeric_data %>% - filter(!!!filter_logic) %>% - extract(1:min(nrow(.), outer_number), ) + filter(!!!filter_logic) # Add the ordering of the pieces in the layer formatted_data <- formatted_data %>% - group_by(.data[[paste0("row_label", formatted_col_index - 1)]]) %>% + # group_by(.data[[paste0("row_label", formatted_col_index - 1)]]) %>% + group_by(row_label1) %>% do(add_data_order_nested(., formatted_col_index - 1, numeric_data, indentation_length = indentation_length, ordering_cols = ordering_cols, @@ -222,7 +230,9 @@ add_order_columns.count_layer <- function(x) { break_ties = break_ties, numeric_cutoff = numeric_cutoff, numeric_cutoff_stat = numeric_cutoff_stat, - numeric_cutoff_column = numeric_cutoff_column)) %>% + numeric_cutoff_column = numeric_cutoff_column, + missing_subjects_row_label = missing_subjects_row_label, + missing_subjects_sort_value = missing_subjects_sort_value)) %>% ungroup() if (!is.null(nest_count) && nest_count) { @@ -352,7 +362,8 @@ add_order_columns.shift_layer <- function(x) { # The logic is the same now for a byvarn so reuse that function formatted_data[, paste0("ord_layer_", formatted_col_index)] <- get_data_order_byvarn(formatted_data, fact_df, as_name(target_var$row), - formatted_col_index, total_row_sort_value = total_row_sort_value) + formatted_col_index, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) rm(formatted_col_index) @@ -428,11 +439,16 @@ get_data_order <- function(x, formatted_col_index) { if(!is.null(missing_string)) missing_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_string) if(!is.null(total_row_label)) total_index <- which(unlist(formatted_data[, label_row_ind]) %in% total_row_label) + if(!is.null(missing_subjects_row_label)) { + missing_subjects_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_subjects_row_label) + } + # No processing is needed here just pass in the needed info get_data_order_bycount(numeric_data, ordering_cols, treat_var, by, cols, result_order_var, target_var, missing_index, missing_sort_value, total_index, total_row_sort_value, + missing_subjects_index, missing_subjects_sort_value, break_ties = break_ties, numeric_cutoff = numeric_cutoff, numeric_cutoff_stat = numeric_cutoff_stat, @@ -466,7 +482,8 @@ get_data_order <- function(x, formatted_col_index) { } get_data_order_byvarn(formatted_data, varn_df, as_name(target_var[[1]]), - formatted_col_index, total_row_sort_value = total_row_sort_value) + formatted_col_index, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) # Here it is 'byfactor' @@ -521,7 +538,8 @@ get_data_order <- function(x, formatted_col_index) { # The logic is the same now for a byvarn so reuse that function get_data_order_byvarn(formatted_data, fact_df, as_name(target_var[[1]]), - formatted_col_index, total_row_sort_value = total_row_sort_value) + formatted_col_index, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) } }, envir = x) } @@ -532,6 +550,8 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, treat_var, by, cols, result_order_var, target_var, missing_index = NULL, missing_sort_value = NULL, total_index = NULL, total_row_sort_value = NULL, + missing_subjects_index = NULL, + missing_subjects_sort_value = NULL, break_ties, numeric_cutoff, numeric_cutoff_stat, numeric_cutoff_column, nested = FALSE) { @@ -606,6 +626,10 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, numeric_ordering_data[total_index,] <- total_row_sort_value } + if(!is.null(missing_subjects_index) && !is.null(missing_subjects_sort_value)) { + numeric_ordering_data[missing_subjects_index,] <- missing_subjects_sort_value + } + # This is the numeric index that the numeric data is in. radix was chosen because # its the only method that gives indicies as far as I can tell # x are the values @@ -632,7 +656,8 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, } get_data_order_byvarn <- function(formatted_data, by_varn_df, by_var, by_column_index, - indentation = "", total_row_sort_value = NULL) { + indentation = "", total_row_sort_value = NULL, + missing_subjects_sort_value = NULL) { # Pull out the by values in the formatted data. by_values <- unlist(formatted_data[, by_column_index]) @@ -650,6 +675,8 @@ get_data_order_byvarn <- function(formatted_data, by_varn_df, by_var, by_column_ # Flag to determine where total row is positioned if(!is.null(total_row_sort_value)) { total_row_sort_value + } else if (!is.null(missing_subjects_sort_value)){ + missing_subjects_sort_value } else { max(by_varn_df[,2]) + 1 } @@ -694,20 +721,19 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { ##### Outer nest values ##### # The value of the outer label outer_value <- group_data[1, tail(row_label_vec, 1)][[1]] + # Reserve for joins + mrg_by <- paste0("row_label", seq_along(by))[-1] if(order_count_method[1] == "byvarn") { varn_df <- get_varn_values(target, as_name(by[[1]])) all_outer$..index <- group_data[1,] %>% - get_data_order_byvarn(varn_df, by[[1]], final_col, total_row_sort_value = total_row_sort_value) + get_data_order_byvarn(varn_df, by[[1]], final_col, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) - group_data[, paste0("ord_layer_", final_col)] <- all_outer %>% - filter(summary_var == outer_value) %>% - ungroup() %>% - select(..index) - } else if(order_count_method[1] == "bycount") { + } else if(order_count_method[1] == "bycount") { all_outer$..index <- all_outer %>% get_data_order_bycount(ordering_cols, treat_var, vars(!!!head(by, -1)), cols, result_order_var, vars(!!by[[1]], !!target_var), @@ -716,65 +742,103 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { numeric_cutoff_stat = numeric_cutoff_stat, numeric_cutoff_column = numeric_cutoff_column, nested = TRUE) + } + - group_data[, paste0("ord_layer_", final_col)] <- all_outer %>% - filter(summary_var == outer_value) %>% - ungroup() %>% - select(..index) + # Grab the index created above and insert it into group data + if (order_count_method[1] %in% c("bycount", "byvarn")){ + if (length(mrg_by) == 0) { + group_data[,"ord_layer_1"] <- all_outer %>% + filter(summary_var == outer_value) %>% + ungroup() %>% + pull(..index) + } else { + group_data[,"ord_layer_1"] <- group_data %>% + left_join( + all_outer %>% + filter(summary_var == outer_value) %>% + replace_by_string_names(c(by, quo(summary_var))) %>% + select(starts_with('row'), ..index, -c(row_label1, !!treat_var)), + by = mrg_by + ) %>% + pull(..index) + } } - present_vars <- unlist(group_data[-1, row_label_vec[length(row_label_vec)]]) + outer_nest_rows <- group_data %>% + filter(!!sym(tail(row_label_vec, 1)) == outer_value) %>% + nrow() + + present_vars <- group_data[(outer_nest_rows + 1): nrow(group_data),][[row_label_vec[length(row_label_vec)]]] ##### Inner nest values ##### filtered_numeric_data <- numeric_data %>% # Only include the parts of the numeric data that is in the current label filter(numeric_data$summary_var %in% present_vars, !!by[[1]] == outer_value) %>% # Remove nesting prefix to prepare numeric data. - mutate(summary_var := str_sub(summary_var, indentation_length)) - + mutate(summary_var := str_sub(summary_var, indentation_length+1)) #Same idea here, remove prefix - filtered_group_data <- group_data[-1, ] %>% + filtered_group_data <- tail(group_data, -outer_nest_rows) %>% mutate(!!row_label_vec[length(row_label_vec)] := str_sub(.data[[row_label_vec[length(row_label_vec)]]], indentation_length + 1)) + + # Identify the index of missing subjects + if(!is.null(missing_subjects_row_label)) { + missing_subjects_index <- which(filtered_group_data[[length(row_label_vec)]] %in% missing_subjects_row_label) + } + # The first row is always the first thing in the order so make it Inf - group_data[1, paste0("ord_layer_", final_col + 1)] <- ifelse((is.null(outer_inf) || outer_inf), Inf, -Inf) + group_data[1:outer_nest_rows, paste0("ord_layer_", final_col + 1)] <- ifelse((is.null(outer_inf) || outer_inf), Inf, -Inf) if(tail(order_count_method, 1) == "bycount") { if (nrow(group_data) > 1) { - group_data[-1 , paste0("ord_layer_", final_col + 1)] <- get_data_order_bycount(filtered_numeric_data, - ordering_cols, - treat_var, - head(by, -1), - cols, - result_order_var, - target_var, - break_ties = break_ties, - numeric_cutoff = numeric_cutoff, - numeric_cutoff_stat = numeric_cutoff_stat, - numeric_cutoff_column = numeric_cutoff_column, - nested = TRUE) - } - } else if(tail(order_count_method, 1) == "byvarn") { - - varn_df <- get_varn_values(target, target_var[[1]]) - - - - group_data[-1, paste0("ord_layer_", final_col + 1)] <- get_data_order_byvarn(filtered_group_data, - varn_df, - target_var[[1]], - length(by) + 1, - indentation, - total_row_sort_value = total_row_sort_value) - - } else { - - group_row_count <- nrow(group_data[-1,]) - # Logic for group_row_count is when numeric_where values cause unexpected results - group_data[-1, paste0("ord_layer_", final_col + 1)] <- 1:ifelse(group_row_count == 0, 1, group_row_count) + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1)] <- get_data_order_bycount(filtered_numeric_data, + ordering_cols, + treat_var, + head(by, -1), + cols, + result_order_var, + target_var, + break_ties = break_ties, + missing_subjects_index = missing_subjects_index, + missing_subjects_sort_value = missing_subjects_sort_value, + numeric_cutoff = numeric_cutoff, + numeric_cutoff_stat = numeric_cutoff_stat, + numeric_cutoff_column = numeric_cutoff_column, + nested = TRUE) + } + } else if(tail(order_count_method, 1) == "byvarn") { + + varn_df <- get_varn_values(target, target_var[[1]]) + + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1) + ] <- get_data_order_byvarn(filtered_group_data, + varn_df, + target_var[[1]], + length(by) + 1, + indentation, + total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) + } else { + group_row_count <- nrow(group_data[(outer_nest_rows + 1): nrow(group_data),]) + # Logic for group_row_count is when numeric_where values cause unexpected results + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1) + ] <- 1:ifelse(group_row_count == 0, 1, group_row_count) + + # missing_subjects_row_label not passing in here + if (!is.null(missing_subjects_sort_value)) { + missing_rows <- group_data[[paste0('row_label', final_col+1)]] == paste0(indentation, missing_subjects_row_label) + group_data[missing_rows, paste0("ord_layer_", final_col + 1)] <- missing_subjects_sort_value + } } group_data diff --git a/R/utils.R b/R/utils.R index 1187d100..e38ab18f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -314,8 +314,8 @@ ut_round <- function(x, n=0) { # x is the value to be rounded # n is the precision of the rounding - posneg <- sign(x) - e <- abs(x) * 10^n + posneg <- sign(x) + e <- abs(x) * 10^n e <- e + 0.5 + sqrt(.Machine$double.eps) e <- trunc(e) e <- e / 10^n diff --git a/R/zzz.R b/R/zzz.R index 70df2747..5b1989d4 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -11,7 +11,7 @@ #' @importFrom stats IQR median sd quantile var #' @importFrom dplyr n summarize filter vars tally ungroup group_by mutate lag select bind_rows full_join add_tally distinct rowwise #' @importFrom dplyr everything rename mutate_at mutate_all as_tibble bind_cols do case_when arrange left_join row_number between mutate_if -#' @importFrom dplyr across anti_join n_distinct if_else group_keys cur_group cur_column pull matches slice_head where desc +#' @importFrom dplyr across anti_join n_distinct if_else group_keys cur_group cur_column pull matches slice_head where desc count #' @importFrom tidyr complete nesting pivot_wider pivot_longer replace_na starts_with fill #' @importFrom utils str head tail #' @importFrom tidyselect all_of vars_select any_of @@ -199,7 +199,6 @@ display_string <- NULL built_target <- NULL table_where <- NULL distinct_by <- NULL -distinct_stat <- NULL summary_vars <- NULL trans_vars <- NULL stat <- NULL @@ -277,7 +276,6 @@ missing_sort_value <- NULL missing_index <- NULL total_index <- NULL process_distinct_total <- FALSE -total_stat_denom <- NULL denom_where <- NULL built_target_pre_where <- NULL count_fmt <- NULL @@ -307,4 +305,14 @@ og_row <- NULL desc <- NULL id <- NULL stub_sort <- NULL - +include_missing_subjects_row <- NULL +missing_subjects_row_label <- NULL +missing_subjects_stat <- NULL +missing_subjects_count_format <- NULL +missing_subject_rows <- NULL +missing_subject_vars_ord <- NULL +string_ms <- NULL +missing_subjects_sort_value <- NULL +limit_data_by <- NULL +n_present <- NULL +header_tots <- NULL diff --git a/data-raw/adpe.R b/data-raw/adpe.R new file mode 100644 index 00000000..d80860eb --- /dev/null +++ b/data-raw/adpe.R @@ -0,0 +1,31 @@ +# This adpe dataset is just a mock-up that's fit for purpose to test and demonstrate data limiting +tplyr_adpe <- tibble::tribble(~USUBJID, ~AVISIT, ~PECAT, ~PARAM, ~TRT01A, ~AVALC, ~AVAL, ~BASEC, + "101-001", "Screening", "A", "Head", "TRT A", "Normal", 1, "Abnormal", + "101-001", "Screening", "A", "Lungs", "TRT A", "Normal", 2, "Semi-Normal", + "101-001", "Day -1", "A", "Lungs", "TRT A", "Normal", 3, "Normal", + "101-001", "Day 5", "A", "Lungs", "TRT A", "Normal", 4, "Normal", + "101-002", "Screening", "A", "Head", "TRT B", "Semi-Normal", 5, "Normal", + "101-002", "Screening", "A", "Lungs", "TRT B", "Normal", 6, "Normal", + "101-002", "Day -1", "A", "Lungs", "TRT B", "Normal", 7, "Normal", + "101-002", "Day 5", "A", "Lungs", "TRT B", "Normal", 8, "Semi-Normal", + "101-003", "Screening", "A", "Head", "TRT A", "Normal", 9, "Normal", + "101-003", "Screening", "A", "Lungs", "TRT A", "Abnormal", 10, "Abnormal", + "101-003", "Day -1", "A", "Lungs", "TRT A", "Normal", 11, "Abnormal", + "101-003", "Day 5", "A", "Lungs", "TRT A", "Abnormal", 12, "Abnormal", + + "101-001", "Screening", "B", "Lungs", "TRT A", "Normal", 13, "Normal", + "101-001", "Day -1", "B", "Lungs", "TRT A", "Normal", 14, "Normal", + "101-001", "Day 5", "B", "Lungs", "TRT A", "Normal", 15, "Semi-Normal", + "101-002", "Screening", "B", "Lungs", "TRT B", "Normal", 16, "Normal", + "101-002", "Day -1", "B", "Lungs", "TRT B", "Normal", 17, "Abnormal", + "101-002", "Day 5", "B", "Lungs", "TRT B", "Normal", 18, "Abnormal", + "101-003", "Screening", "B", "Lungs", "TRT A", "Abnormal", 19, "Normal", + "101-003", "Day -1", "B", "Lungs", "TRT A", "Normal", 20, "Normal", + "101-003", "Day 5", "B", "Lungs", "TRT A", "Abnormal", 21, "Normal" +) + +tplyr_adpe$AVALC <- factor(tplyr_adpe$AVALC, levels = c("Normal", "Semi-Normal", "Abnormal")) +tplyr_adpe$BASEC <- factor(tplyr_adpe$BASEC, levels = c("Normal", "Semi-Normal", "Abnormal")) +tplyr_adpe$AVISIT <- factor(tplyr_adpe$AVISIT, levels = c("Screening", "Day -1", "Day 5")) + +usethis::use_data(tplyr_adpe, overwrite = TRUE) diff --git a/data/tplyr_adpe.rda b/data/tplyr_adpe.rda new file mode 100644 index 00000000..58557959 Binary files /dev/null and b/data/tplyr_adpe.rda differ diff --git a/man/add_missing_subjects_row.Rd b/man/add_missing_subjects_row.Rd new file mode 100644 index 00000000..55769f5a --- /dev/null +++ b/man/add_missing_subjects_row.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/count_bindings.R +\name{add_missing_subjects_row} +\alias{add_missing_subjects_row} +\title{Add a missing subject row into a count summary.} +\usage{ +add_missing_subjects_row(e, fmt = NULL, sort_value = NULL) +} +\arguments{ +\item{e}{A `count_layer` object} + +\item{fmt}{An f_str object used to format the total row. If none is provided, +display is based on the layer formatting.} + +\item{sort_value}{The value that will appear in the ordering column for total +rows. This must be a numeric value.} +} +\description{ +This function calculates the number of subjects missing from a particular +group of results. The calculation is done by examining the total number of +subjects potentially available from the Header N values within the result +column, and finding the difference with the total number of subjects present +in the result group. Note that for accurate results, the subject variable +needs to be defined using the `set_distinct_by()` function. As with other +methods, this function instructs how distinct results should be identified. +} +\examples{ + +tplyr_table(mtcars, gear) \%>\% + add_layer( + group_count(cyl) \%>\% + add_missing_subjects_row(f_str("xxxx", n)) + ) \%>\% + build() +} diff --git a/man/set_limit_data_by.Rd b/man/set_limit_data_by.Rd new file mode 100644 index 00000000..c9416a76 --- /dev/null +++ b/man/set_limit_data_by.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/set_limit_data_by.R +\name{set_limit_data_by} +\alias{set_limit_data_by} +\title{Set variables to limit reported data values only to those that exist rather +than fully completing all possible levels} +\usage{ +set_limit_data_by(e, ...) +} +\arguments{ +\item{e}{A tplyr_layer} + +\item{...}{Subset of variables within by or target variables} +} +\value{ +a tplyr_table +} +\description{ +This function allows you to select a combination of by variables or +potentially target variables for which you only want to display values +present in the data. By default, Tplyr will create a cartesian combination of +potential values of the data. For example, if you have 2 by variables +present, then each potential combination of those by variables will have a +row present in the final table. \code{set_limit_data_by()} allows you to choose +the by variables whose combination you wish to limit to values physically +present in the available data. +} +\examples{ + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) + ) \%>\% + build() + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) \%>\% + set_limit_data_by(PARAM, AVISIT) + ) \%>\% + build() + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) \%>\% + set_limit_data_by(PARAM, AVISIT) + ) \%>\% + build() + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) \%>\% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) \%>\% + build() +} diff --git a/man/set_missing_subjects_row_label.Rd b/man/set_missing_subjects_row_label.Rd new file mode 100644 index 00000000..bcc943a1 --- /dev/null +++ b/man/set_missing_subjects_row_label.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/count_bindings.R +\name{set_missing_subjects_row_label} +\alias{set_missing_subjects_row_label} +\title{Set the label for the missing subjects row} +\usage{ +set_missing_subjects_row_label(e, missing_subjects_row_label) +} +\arguments{ +\item{e}{A \code{count_layer} object} + +\item{missing_subjects_row_label}{A character to label the total row} +} +\value{ +The modified \code{count_layer} object +} +\description{ +Set the label for the missing subjects row +} +\examples{ + +t <- tplyr_table(mtcars, gear) \%>\% + add_layer( + group_count(cyl) \%>\% + add_missing_subjects_row() \%>\% + set_missing_subjects_row_label("Missing") + ) +build(t) +} diff --git a/man/tplyr_adpe.Rd b/man/tplyr_adpe.Rd new file mode 100644 index 00000000..ab19f95d --- /dev/null +++ b/man/tplyr_adpe.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{tplyr_adpe} +\alias{tplyr_adpe} +\title{ADPE Data} +\format{ +A data.frame with 21 rows and 8 columns. +} +\usage{ +tplyr_adpe +} +\description{ +A mock-up dataset that is fit for testing data limiting +} +\keyword{datasets} diff --git a/tests/testthat/_snaps/count.md b/tests/testthat/_snaps/count.md index 6045b796..01d31a37 100644 --- a/tests/testthat/_snaps/count.md +++ b/tests/testthat/_snaps/count.md @@ -615,3 +615,27 @@ var1_54_F var1_54_M var1_81_F var1_81_M 1 27 (54.0) [50] 23 (67.6) [34] 17 (42.5) [40] 26 (59.1) [44] +# Error checking for add_missing_subjects_row() + + Argument `fmt` does not inherit "f_str". Classes: character + +--- + + Argument `sort_value` does not inherit "numeric". Classes: character + +--- + + Argument `e` does not inherit "count_layer". Classes: tplyr_layer, desc_layer, environment + +--- + + Argument `missing_subjects_row_label` must be character. Instead a class of "numeric" was passed. + +--- + + length(missing_subjects_row_label) not equal to 1 + +--- + + Argument `e` does not inherit "count_layer". Classes: tplyr_layer, desc_layer, environment + diff --git a/tests/testthat/test-count.R b/tests/testthat/test-count.R index b4d929f2..c8fc41ce 100644 --- a/tests/testthat/test-count.R +++ b/tests/testthat/test-count.R @@ -945,3 +945,163 @@ test_that("Regression test to make sure cols produce correct denom", { expect_snapshot(t) }) + +test_that("Error checking for add_missing_subjects_row()", { + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + add_missing_subjects_row("blah") + ) + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + add_missing_subjects_row(f_str("xx", distinct_n), sort_value = "x") + ) + ) + + expect_error({ + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_shift(vars(AEBODSYS, AEDECOD)) %>% + add_missing_subjects_row(f_str("xx", distinct_n)) + ) + }, "`add_missing_subjects_row` for shift layers" + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_desc(RACEN) %>% + add_missing_subjects_row(f_str("xx", distinct_n)) + ) + ) + + ## ---- + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + set_missing_subjects_row_label(3) + ) + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + set_missing_subjects_row_label(c("x", "y")) + ) + ) + + expect_error({ + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_shift(vars(AEBODSYS, AEDECOD)) %>% + set_missing_subjects_row_label("x") + )}, "`set_missing_subjects_row_label` for shift layers" + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_desc(RACEN) %>% + set_missing_subjects_row_label("x") + ) + ) + +}) + +test_that("Missing subjects row calculates correctly", { + x <- tplyr_table(tplyr_adlb, TRTA, cols=SEX) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(ANRIND, by = vars(PARAM, AVISIT)) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx", distinct_n)) + ) %>% + build() + + # Check 1 + in_res1 <- x %>% + filter(row_label3 == "Missing", row_label1 == "Blood Urea Nitrogen (mmol/L)", row_label2 == "Week 12") %>% + pull(var1_Placebo_F) %>% + as.numeric() + + pop1 <- tplyr_adsl %>% + filter(TRT01A == "Placebo", SEX == "F") %>% + nrow() + + dat1 <- tplyr_adlb %>% + filter(PARAM == "Blood Urea Nitrogen (mmol/L)", AVISIT == "Week 12", TRTA == "Placebo", SEX == "F") %>% + distinct(USUBJID) %>% + nrow() + + expect_equal(pop1-dat1, in_res1) + + # Check 2 + in_res2 <- x %>% + filter(row_label3 == "Missing", row_label1 == "Gamma Glutamyl Transferase (U/L)", row_label2 == "Week 24") %>% + pull(`var1_Xanomeline Low Dose_M`) %>% + as.numeric() + + pop2 <- tplyr_adsl %>% + filter(TRT01A == "Xanomeline Low Dose", SEX == "M") %>% + nrow() + + dat2 <- tplyr_adlb %>% + filter(PARAM == "Gamma Glutamyl Transferase (U/L)", AVISIT == "Week 24", TRTA == "Xanomeline Low Dose", SEX == "M") %>% + distinct(USUBJID) %>% + nrow() + + expect_equal(pop2-dat2, in_res2) + +}) + +test_that("Missing counts on nested count layers function correctly", { + x <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) + ) %>% + build() + + expect_equal(nrow(x %>% filter(row_label2 == " Missing")), 1) + expect_equal(tail(x, 1)$ord_layer_2, Inf) + + # Verify that bycount works for missing values and sort value is assigned correctly + x <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_distinct_by(USUBJID) %>% + set_order_count_method("bycount") %>% + set_ordering_cols("Xanomeline High Dose") %>% + set_result_order_var(distinct_n) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = 99999) + ) %>% + build() + + expect_equal(tail(x, 1)$ord_layer_2, 99999) + + # Also test that label reassignment flows + x <- tplyr_table(tplyr_adsl, TRT01A) %>% + add_layer( + group_count(vars(SEX, RACE)) %>% + set_order_count_method(c("byfactor", "byvarn")) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = 99999) %>% + set_missing_subjects_row_label("New label") + ) %>% + build() + + expect_equal(filter(x, row_label2 == " New label")$ord_layer_2, c(99999, 99999)) +}) diff --git a/tests/testthat/test-set_limit_data_by.R b/tests/testthat/test-set_limit_data_by.R new file mode 100644 index 00000000..1f0e2bae --- /dev/null +++ b/tests/testthat/test-set_limit_data_by.R @@ -0,0 +1,138 @@ +test_that("Descriptive statistics data limiting works properly", { + t1 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) + ) + + x1 <- build(t1) + + cnts1 <- dplyr::count(x1, row_label1, row_label2) + expect_equal(cnts1$n, c(18, 18, 18, 18)) + + t2 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) + + x2 <- build(t2) + + cnts2 <- dplyr::count(x2, row_label1, row_label2) + expect_equal(cnts2$n, c(6, 18, 6, 18)) + + + t3 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) + + x3 <- build(t3) + + cnts3 <- dplyr::count(x3, row_label1, row_label2) + expect_equal(cnts3$n, c(6, 18, 18)) +}) + + +test_that("Shift layers can also handle data limiting", { + + t1 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) + ) + + x1 <- build(t1) + + cnts1 <- dplyr::count(x1, row_label1, row_label2) + expect_equal(cnts1$n, c(9, 9, 9, 9)) + + t2 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) + + x2 <- build(t2) + + cnts2 <- dplyr::count(x2, row_label1, row_label2) + expect_equal(cnts2$n, c(3, 9, 3, 9)) + + t3 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) + + x3 <- build(t3) + + cnts3 <- dplyr::count(x3, row_label1, row_label2) + expect_equal(cnts3$n, c(3, 9, 9)) + +}) + + +test_that("Count data limiting works properly", { + + t1 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) + ) + + x1 <- build(t1) + + cnts1 <- dplyr::count(x1, row_label1, row_label2) + expect_equal(cnts1$n, c(9, 9, 9, 9)) + + t2 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) + + x2 <- build(t2) + + cnts2 <- dplyr::count(x2, row_label1, row_label2) + expect_equal(cnts2$n, c(3, 9, 3, 9)) + + t3 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) + + x3 <- build(t3) + + cnts3 <- dplyr::count(x3, row_label1, row_label2) + expect_equal(cnts3$n, c(3, 9, 9)) +}) + + +test_that("Nested count layers limit data accurately", { + + t_ae1 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% + set_limit_data_by(AEOUT, AEDECOD) + ) + + t_ae_df1 <- t_ae1 %>% + build() %>% select(-starts_with('ord')) + + t_ae2 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% + set_limit_data_by(AESEV, AEOUT, AEDECOD) + ) + + t_ae_df2 <- t_ae2 %>% + build() %>% select(-starts_with('ord')) + + dropped_rows <- anti_join( + t_ae_df1, + t_ae_df2, + by=names(t_ae_df1) + ) + + check <- c(dropped_rows$var1_Placebo, dropped_rows$`var1_Xanomeline High Dose`, dropped_rows$`var1_Xanomeline Low Dose`) + expect_true(all(check == " 0 ( 0.0%)")) +}) diff --git a/tests/testthat/test-sort.R b/tests/testthat/test-sort.R index c77458dd..e32fa740 100644 --- a/tests/testthat/test-sort.R +++ b/tests/testthat/test-sort.R @@ -264,3 +264,39 @@ test_that("by variables get sorted with varn/factors in the correct order", { expect_equal(t3[["row_label1"]], c("1", "1", "1", "0", "0", "0")) expect_equal(t3[["ord_layer_1"]], c(1, 1, 1, 2, 2, 2)) }) + + +# Added to address #175 +test_that("Nested counts with by variables process properly", { + + t_ae1 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) + %>% + set_order_count_method("bycount") %>% + set_ordering_cols("Xanomeline High Dose") %>% + set_result_order_var(distinct_n) + ) + + t_ae_df1 <- t_ae1 %>% + build() + + # This is verifying that the right number of combinations of row_labels exist, and that + # there aren't duplicate order values for the outer layer + expect_equal(nrow(dplyr::count(t_ae_df1, row_label2, row_label3, ord_layer_3)), 6) + + t_ae2 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars("testing", AEDECOD), by=AEOUT) %>% + set_order_count_method("bycount") %>% + set_ordering_cols("Xanomeline High Dose") %>% + set_result_order_var(distinct_n) + ) + + t_ae_df2 <- t_ae2 %>% + build() + + # Same test but now working with a text outer layer and one by variable + expect_equal(nrow(dplyr::count(t_ae_df2, row_label2, ord_layer_2)), 2) + +}) diff --git a/vignettes/denom.Rmd b/vignettes/denom.Rmd index af221fa5..cb657c4e 100644 --- a/vignettes/denom.Rmd +++ b/vignettes/denom.Rmd @@ -233,6 +233,32 @@ t %>% We did one more other thing worth explaining in the example above - gave the missing count its own sort value. If you leave this field null, it will simply be the maximum value in the order layer plus 1, to put the Missing counts at the bottom during an ascending sort. But tables can be sorted a lot of different ways, as you'll see in the sort vignette. So instead of trying to come up with novel ways for you to control where the missing row goes - we decided to just let you specify your own value. +## Missing Subjects + +Missing counts and counting missing subjects work two different ways within Tplyr. Missing counts, as described above, will examine the records present in the data and collect and missing values. But for these results to be counted, they need to first be provided within the input data itself. On the other hand, missing subjects are calculated by looking at the difference between the potential number of subjects within the column (i.e. the combination of the treatment variables and column variables) and the number of subjects actually present. Consider this example: + +```{r missing_subs1} + missing_subs <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_nest_count(TRUE) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) %>% + set_missing_subjects_row_label("Missing Subjects") + ) %>% + build() + + tail(missing_subs) %>% + select(-starts_with('ord')) %>% + kable() +``` + +In the example above, we produce a nested count layer. The function `add_missing_subjects_row()` triggers the addition of the new result row for which the missing subjects are calculated. The row label applied for this can be configured using `set_missing_subjects_row_label()`, and the row label itself will default to 'Missing'. Depending on your sorting needs, a `sort_value` can be applied to whatever numeric value you provide. Lastly, you can provide an `f_str()` to format the missing subjects row separately from the rest of the layer, but whatever format is applied to the layer will apply otherwise. + +Note that in nested count layers, missing subject rows will generate for each independent group within the outer layer. Outer layers cannot have missing subject rows calculated individually. This would best be done in an independent layer itself, as the result would apply to the whole input target dataset. + ## Adding a 'Total' Row In addition to missing counts, some summaries require the addition of a 'Total' row. **Tplyr** has the helper function `add_total_row()` to ease this process for you. Like most other things within **Tplyr** - particularly in this vignette - this too has a significant bit of nuance to it. diff --git a/vignettes/table.Rmd b/vignettes/table.Rmd index ae9c9a82..bad21d72 100644 --- a/vignettes/table.Rmd +++ b/vignettes/table.Rmd @@ -115,6 +115,58 @@ header_n(t) %>% Note: it’s expected the `set_distinct_by()` function is used with population data. This is because it does not make sense to use population data denominators unless you have distinct counts. The entire point of population data is to use subject counts, so non-distinct counts would potentially count multiple records per subject and then the percentage doesn’t make any sense. +## Data Completion + +When creating summary tables, often we have to mock up the potential values of data, even if those values aren't present in the data we're summarizing. **Tplyr** does its best effort to do this for you. Let's consider the following dataset: + +```{r data_comp, echo=FALSE} +kable(head(tplyr_adpe)) +``` +Let's say we want to create a count summary for this dataset, and report it by PARAM and AVISIT. Note that in the data, `PARAM=="HEAD"` is only collected at screening, while `LUNGS` is collected at Screening, Day -1, and Day 5. + +```{r data_comp1} +tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PARAM, AVISIT)) + ) %>% + build() %>% + select(-starts_with('ord')) %>% + head(18) %>% + kable() + +``` + +By default, given the `by` variables of PARAM and AVISIT, all of the potential visits have dummy rows created that are 0 filled - meaning results of 0 records for all treatment groups are presented. However, that might not be what you wish to present. Perhaps `HEAD` was only intended to be collected at the Screening visit so it's unnecessary to present other visits. To address this, you can use the `set_limit_data_by()` function. + +```{r data_comp2} +tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) %>% + build() %>% + select(-starts_with('ord')) %>% + head(12) %>% + kable() +``` + +Here you can see that now records for `HEAD` only present the screening visit. For count and shift layers, you can additionally dig further in to use target variables: + +```{r data_comp3} +tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT, AVALC) + ) %>% + build() %>% + select(-starts_with('ord')) %>% + kable() +``` + +This effectively limits to the values present in the data itself. + +## Where to Go From Here + With the table level settings under control, now you're ready to learn more about what **Tplyr** has to offer in each layer. - Learn more about descriptive statistics layers in `vignette("desc")`