diff --git a/.Rbuildignore b/.Rbuildignore index 9da721c8..c2d17038 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -22,3 +22,6 @@ ^Makefile$ ^Jenkinsfile$ ^rsconnect$ +^data-raw$ +^scratch.R$ +^CRAN-SUBMISSION$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 15560054..1e93f191 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -5,10 +5,12 @@ on: branches: - main - master + - devel pull_request: branches: - main - master + - devel name: R-CMD-check diff --git a/.gitignore b/.gitignore index 3c628ab0..83d4b60f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ inst/doc Tplyr.Rproj docs/ +scratch.R diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c2c05e47..00000000 --- a/.travis.yml +++ /dev/null @@ -1,12 +0,0 @@ -language: r - -sudo: required - -env: _R_CHECK_CRAN_INCOMING_=FALSE - -r_packages: -- covr -- devtools - -after_success: - - Rscript -e 'library(covr); codecov()' diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION new file mode 100644 index 00000000..a964860a --- /dev/null +++ b/CRAN-SUBMISSION @@ -0,0 +1,3 @@ +Version: 1.2.0 +Date: 2024-02-14 17:07:48 UTC +SHA: 806f9a0a103059542437632f5977cc1e8ded2652 diff --git a/DESCRIPTION b/DESCRIPTION index 0e7ccb87..fec79b17 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Tplyr Title: A Traceability Focused Grammar of Clinical Data Summary -Version: 1.1.0 +Version: 1.2.1 Authors@R: c( person(given = "Eli", @@ -26,6 +26,18 @@ Authors@R: family = "Mascary", email = "sadchla.mascary@atorusresearch.com", role = "ctb"), + person(given = "Andrew", + family = "Bates", + email = "andrew.bates@atorusresearch.com", + role = "ctb"), + person(given = "Shiyu", + family = "Chen", + email = "shiyu.chen@atorusresearch.com", + role = "ctb"), + person(given = "Oleksii", + family = "Mikryukov", + email = "alex.mikryukov@atorusresearch.com", + role = "ctb"), person(given = "Atorus Research LLC", role = "cph") ) @@ -62,3 +74,4 @@ VignetteBuilder: knitr RoxygenNote: 7.2.3 RdMacros: lifecycle Config/testthat/edition: 3 +LazyData: true diff --git a/NAMESPACE b/NAMESPACE index 5411b68d..b3b291b0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,6 +32,10 @@ S3method(set_denoms_by,count_layer) S3method(set_denoms_by,shift_layer) S3method(set_format_strings,count_layer) S3method(set_format_strings,desc_layer) +S3method(set_format_strings,shift_layer) +S3method(set_limit_data_by,count_layer) +S3method(set_limit_data_by,desc_layer) +S3method(set_limit_data_by,shift_layer) S3method(set_where,tplyr_layer) S3method(set_where,tplyr_table) S3method(str,f_str) @@ -40,10 +44,12 @@ S3method(str,tplyr_table) export("%>%") export("header_n<-") export("pop_data<-") +export(add_anti_join) export(add_column_headers) export(add_filters) export(add_layer) export(add_layers) +export(add_missing_subjects_row) export(add_risk_diff) export(add_total_group) export(add_total_row) @@ -54,9 +60,11 @@ export(apply_conditional_format) export(apply_formats) export(apply_row_masks) export(build) +export(collapse_row_labels) export(f_str) export(get_by) export(get_count_layer_formats) +export(get_data_labels) export(get_desc_layer_formats) export(get_layer_template) export(get_layer_templates) @@ -86,6 +94,7 @@ export(process_statistic_data) export(process_statistic_formatting) export(process_summaries) export(remove_layer_template) +export(replace_leading_whitespace) export(set_by) export(set_count_layer_formats) export(set_custom_summaries) @@ -97,7 +106,9 @@ 
export(set_distinct_by) export(set_format_strings) export(set_header_n) export(set_indentation) +export(set_limit_data_by) export(set_missing_count) +export(set_missing_subjects_row_label) export(set_nest_count) export(set_numeric_threshold) export(set_order_count_method) @@ -135,8 +146,10 @@ importFrom(dplyr,between) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) +importFrom(dplyr,count) importFrom(dplyr,cur_column) importFrom(dplyr,cur_group) +importFrom(dplyr,desc) importFrom(dplyr,distinct) importFrom(dplyr,do) importFrom(dplyr,everything) @@ -159,10 +172,12 @@ importFrom(dplyr,rename) importFrom(dplyr,row_number) importFrom(dplyr,rowwise) importFrom(dplyr,select) +importFrom(dplyr,slice_head) importFrom(dplyr,summarize) importFrom(dplyr,tally) importFrom(dplyr,ungroup) importFrom(dplyr,vars) +importFrom(dplyr,where) importFrom(forcats,fct_collapse) importFrom(forcats,fct_drop) importFrom(forcats,fct_expand) @@ -176,6 +191,7 @@ importFrom(purrr,flatten) importFrom(purrr,imap) importFrom(purrr,map) importFrom(purrr,map2) +importFrom(purrr,map2_chr) importFrom(purrr,map2_dfr) importFrom(purrr,map_chr) importFrom(purrr,map_dbl) @@ -194,7 +210,6 @@ importFrom(rlang,as_name) importFrom(rlang,call_args) importFrom(rlang,call_modify) importFrom(rlang,call_name) -importFrom(rlang,call_standardise) importFrom(rlang,caller_env) importFrom(rlang,current_env) importFrom(rlang,enexpr) @@ -220,6 +235,7 @@ importFrom(rlang,is_logical) importFrom(rlang,is_named) importFrom(rlang,is_quosure) importFrom(rlang,is_quosures) +importFrom(rlang,list2) importFrom(rlang,quo) importFrom(rlang,quo_get_expr) importFrom(rlang,quo_is_call) @@ -245,6 +261,7 @@ importFrom(stringr,str_detect) importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_locate_all) +importFrom(stringr,str_match) importFrom(stringr,str_match_all) importFrom(stringr,str_pad) importFrom(stringr,str_remove_all) diff --git a/NEWS.md b/NEWS.md index 2482e4c0..6bf7d521 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,21 @@ +# Tplyr 1.2.1 +- Resolve #178 to add metadata handling for missing subjects, and add the `add_anti_join()` function + +# Tplyr 1.2.0 +- Resolve #62 Add data vignette data into the package (thanks for the suggestion @thebioengineer) +- Resolve #74 Add an example of piping in set_pop_data +- Resolve #83 Add the `add_missing_subjects()` function +- Resolve #84 Add `set_limit_data_by()` function +- Resolve #111, #148 Allow ellipsis argument unpacking outside of `add_layer()` +- Resolve #129 Add `collapse_row_labels()` function +- Resolve #134 Clarify how population data works to trigger denominators +- Resolve #75, #146, #166 Fix nested count layer handling where one inner layer value exists in multiple outer layer groups +- Resolve #21, #152 Fix handling of Inf, -Inf in desc layer for min and max +- Resolve #154 Fix namespace scoping for execution of Tplyr tables within non-global environments +- Resolve #155 Dead code clean-up +- Resolve #170 Add `replace_leading_whitespace()` post-processing function +- Resolve #173 Fix nested count layer sort variable behavior when using by variables + # Tplyr 1.1.0 - This release incorporate parenthesis hugging across all layers (#117) - New functions `apply_conditional_formats()`, `str_extract_fmt_group()` and `str_extract_num()` diff --git a/R/apply_conditional_format.R b/R/apply_conditional_format.R index 16198fa1..54583b07 100644 --- a/R/apply_conditional_format.R +++ b/R/apply_conditional_format.R @@ -11,15 +11,15 @@ #' 
@noRd validate_conditional_format_params <- function(string, format_group, condition, replacement, full_string) { if (!inherits(string, "character")) { - stop("Paramter `string` must be a character vector", call.=FALSE) + stop("Parameter `string` must be a character vector", call.=FALSE) } if (!inherits(format_group, "numeric") || (inherits(format_group, "numeric") && format_group %% 1 != 0)) { - stop("Paramter `format_group` must be an integer", call.=FALSE) + stop("Parameter `format_group` must be an integer", call.=FALSE) } if (!inherits(replacement, "character")) { - stop("Paramter `replacement` must be a string", call.=FALSE) + stop("Parameter `replacement` must be a string", call.=FALSE) } # Condition statement must use the variable name 'x' @@ -28,7 +28,7 @@ validate_conditional_format_params <- function(string, format_group, condition, } if (!inherits(full_string, "logical")) { - stop("Paramter `full_string` must be bool", call.=FALSE) + stop("Parameter `full_string` must be bool", call.=FALSE) } } diff --git a/R/apply_formats.R b/R/apply_formats.R index 38f9b5eb..179d23e3 100644 --- a/R/apply_formats.R +++ b/R/apply_formats.R @@ -38,7 +38,7 @@ apply_formats <- function(format_string, ..., empty = c(.overall = "")) { call.=FALSE) } - pmap_chr(list(...), function(...) apply_fmts(...), fmt=format) + pmap_chr(list2(...), function(...) apply_fmts(...), fmt=format) } #' Application of individual format string @@ -53,7 +53,7 @@ apply_formats <- function(format_string, ..., empty = c(.overall = "")) { #' @return An individually formatted string #' @noRd apply_fmts <- function(..., fmt) { - nums <- list(...) + nums <- list2(...) repl <- vector('list', length(fmt$settings)) for (i in seq_along(fmt$settings)) { repl[[i]] <- num_fmt(nums[[i]], i, fmt=fmt) diff --git a/R/assertions.R b/R/assertions.R index ba108032..4e36c217 100644 --- a/R/assertions.R +++ b/R/assertions.R @@ -42,22 +42,7 @@ assert_has_class <- function(x, should_be) { # Is the argument the class that it should be? if (class(x) != should_be){ - # Grab the trace back into an object - trc <- trace_back() - # Look at the length of the traceback - max_length <- length(trc$calls) - # If it's >1 we're inside a function, so grab the name - if (max_length > 1){ - # Pull the name out of the call stack - cname <- call_name(trc$calls[[max_length - 1]]) - # Make a display string - func_str <- paste0('` in function `', cname, '`') - } else { - # Filler - func_str <- '`' - } - # Abort and show error - abort(paste0('Argument `', param, func_str, ' must be ', + abort(paste0('Argument `', param, '` must be ', should_be, '. Instead a class of "', class(x), '" was passed.')) } @@ -75,24 +60,9 @@ assert_inherits_class <- function(x, should_have) { # Is the argument the class that it should be? if (!inherits(x, should_have)){ - - # Grab the trace back into an object - trc <- trace_back() - # Look at the length of the traceback - max_length <- max(trc$indices) - # If it's >1 we're innside a function, so grab the name - if (max_length > 1){ - # Pull the name out of the call stack - cname <- call_name(trc$calls[[max_length - 1]]) - # Make a display string - func_str <- paste0('` in function `', cname, '`') - } else { - # Filler - func_str <- '`' - } # Abort and show error - abort(paste0('Argument `', param, func_str, - ' does not inherit "', should_have, + abort(paste0('Argument `', param, + '` does not inherit "', should_have, '". 
Classes: ', paste(class(x), collapse=", "))) } } @@ -197,15 +167,6 @@ unpack_vars <- function(quo_list, allow_character=TRUE) { quo_list } -#' Check if a quosure is null or contains a call -#' -#' @param quo_var A quosure object to check -#' -#' @noRd -is_null_or_call <- function(quo_var) { - quo_is_null(quo_var) || inherits(quo_get_expr(quo_var), "call") -} - #' Check if a quosure is null or contains a logical value #' #' @param quo_var A quosure object to check @@ -222,14 +183,6 @@ assert_is_layer <- function(object) { assert_inherits_class(object, "tplyr_layer") } -#' @param object Object to check if its a layer -#' -#' @noRd -assert_is_table <- function(object) { - assert_inherits_class(object, "tplyr_table") -} - - #' Return the class of the expression inside a quosure #' #' @param q A quosure diff --git a/R/collapse_row_labels.R b/R/collapse_row_labels.R new file mode 100644 index 00000000..d4a7e547 --- /dev/null +++ b/R/collapse_row_labels.R @@ -0,0 +1,122 @@ +#' Add indentation level based +#' +#' @param .x The number of levels to indent +#' @param .y Input variable for which indentation will be done +#' @param indent Indentation string to be used, which is multiplied at each indentation level +#' +#' @return Character string with indentation applied +#' @noRd +add_indentation <- function(.x, .y, indent = " ") { + paste(c(rep("",.x-1), .y), collapse=indent) +} + + +#' Collapse row labels into a single column +#' +#' This is a generalized post processing function that allows you to take groups +#' of by variables and collapse them into a single column. Repeating values are +#' split into separate rows, and for each level of nesting, a specified +#' indentation level can be applied. +#' +#' @param x Input data frame +#' @param ... Row labels to be collapsed +#' @param indent Indentation string to be used, which is multiplied at each indentation level +#' @param target_col The desired name of the output column containing collapsed row labels +#' +#' @return data.frame with row labels collapsed into a single column +#' @export +#' +#' @examples +#' x <- tibble::tribble( +#' ~row_label1, ~row_label2, ~row_label3, ~row_label4, ~var1, +#' "A", "C", "G", "M", 1L, +#' "A", "C", "G", "N", 2L, +#' "A", "C", "H", "O", 3L, +#' "A", "D", "H", "P", 4L, +#' "A", "D", "I", "Q", 5L, +#' "A", "D", "I", "R", 6L, +#' "B", "E", "J", "S", 7L, +#' "B", "E", "J", "T", 8L, +#' "B", "E", "K", "U", 9L, +#' "B", "F", "K", "V", 10L, +#' "B", "F", "L", "W", 11L +#' ) +#' +#' +#' collapse_row_labels(x, row_label1, row_label2, row_label3, row_label4) +#' +#' collapse_row_labels(x, row_label1, row_label2, row_label3) +#' +#' collapse_row_labels(x, row_label1, row_label2, indent = " ", target_col = rl) +#' +collapse_row_labels <- function(x, ..., indent = " ", target_col=row_label) { + + target_col = enquo(target_col) + dots <- enquos(...) 
+ + # browser() + dot_names <- map_chr(dots, as_label) + + if (!inherits(x, 'data.frame')) { + stop('x must be a data frame', call.=FALSE) + } + + if (!inherits(indent, 'character')) { + stop("indent must be a character string", call.=FALSE) + } + + if (!all(map_lgl(dots, quo_is_symbol))) { + stop("Columns provided to dots must be provided as unquoted symbols.", call.=FALSE) + } + + if (!all(dot_names %in% names(x))) { + stop("Columns provided to dots are missing from x.", call.=FALSE) + } + + if (!quo_is_symbol(target_col)) { + stop("target_col must be provided as an unquoted symbol.", call.=FALSE) + } + + if (length(dots) < 2) { + stop("Must have two or more columns to collapse", call.=FALSE) + } + + all_but_last <- dots[1:length(dots)-1] + + # Add the original row identifier + x['og_row'] <- as.numeric(rownames(x)) + + # Grab the desired rowlabels, except for the last one specified in the dots + rowlabs <- select(x, !!!all_but_last, og_row) + + # Get the distinct list of stubs from the data and grab the nesting level + stubs <- rowlabs %>% + group_by(!!!all_but_last) %>% + slice_head() %>% + pivot_longer( + map_chr(all_but_last, as_label), + names_to = NULL, + values_to = as_label(target_col) + ) %>% + group_by(og_row) %>% + mutate( + stub_sort = row_number() + ) + + # Join back to the original data + x %>% + bind_rows(stubs, .id="id") %>% + # Put everything into the right spot + arrange(og_row, desc(id)) %>% + fill(stub_sort) %>% + mutate( + # Figure out the indentation level + stub_sort = if_else(id == 1, stub_sort + 1, stub_sort), + # Build and indent the new row label column + !!target_col := if_else(is.na(!!target_col), !!!tail(dots, 1), !!target_col), + !!target_col := map2_chr(stub_sort, !!target_col, add_indentation, indent = indent), + # Fill in the empty character fields + across(where(is.character), ~ replace_na(., '')) + ) %>% + select(!!target_col, !c(id, og_row, stub_sort, !!!dots)) +} diff --git a/R/count.R b/R/count.R index 030db51d..49a6e15f 100644 --- a/R/count.R +++ b/R/count.R @@ -39,6 +39,13 @@ process_summaries.count_layer <- function(x, ...) { sep = " "))) } + # Do this here to make sure that defaults are available everywhere else + # Downstream + if (is.null(include_total_row)) include_total_row <- FALSE + if (is.null(total_row_label)) total_row_label <- "Total" + if (is.null(include_missing_subjects_row)) include_missing_subjects_row <- FALSE + if (is.null(missing_subjects_row_label)) missing_subjects_row_label <- "Missing" + # Save this for the denominator where, but only if it hasn't been saved yet. if (is.null(built_target_pre_where)) built_target_pre_where <- built_target @@ -121,15 +128,12 @@ process_summaries.count_layer <- function(x, ...) { #' If include_total_row is true a row will be added with a total row labeled #' with total_row_label. 
#' -#' Complete is used to complete the combinaions of by, treat_var, and target_var +#' Complete is used to complete the combinations of by, treat_var, and target_var #' #' @noRd process_single_count_target <- function(x) { evalq({ - if (is.null(include_total_row)) include_total_row <- FALSE - if (is.null(total_row_label)) total_row_label <- "Total" - # The current environment should be the layer itself process_count_n(current_env()) @@ -150,6 +154,10 @@ process_single_count_target <- function(x) { } } + if (include_missing_subjects_row) { + process_missing_subjects_row(current_env()) + } + if (is.null(count_row_prefix)) count_row_prefix <- "" # If a denoms variable is factor then it should be character for the denoms calculations @@ -186,26 +194,13 @@ process_single_count_target <- function(x) { } # rbind tables together - numeric_data <- summary_stat %>% - bind_rows(total_stat) %>% + numeric_data <- bind_rows(summary_stat, total_stat, missing_subjects_stat) %>% rename("summary_var" = !!target_var[[1]]) %>% group_by(!!!denoms_by) %>% do(get_denom_total(., denoms_by, denoms_df_prep, "n")) %>% mutate(summary_var = prefix_count_row(summary_var, count_row_prefix)) %>% ungroup() - - if (!is.null(distinct_stat)) { - if (include_total_row) { - distinct_stat <- distinct_stat %>% - bind_rows(total_stat_denom) %>% - group_by(!!!denoms_by) %>% - do(get_denom_total(., denoms_by, denoms_df, "distinct_n")) - } - numeric_data <- bind_cols(numeric_data, - distinct_stat[, c("distinct_n", "distinct_total")]) - } - rm(denoms_df_prep, fct_cols) }, envir = x) @@ -256,11 +251,15 @@ process_count_n <- function(x) { names(missing_count_list))) } - summary_stat <- summary_stat %>% - # complete all combinations of factors to include combinations that don't exist. - # add 0 for combinations that don't exist - complete(!!treat_var, !!!by, !!!target_var, !!!cols, - fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0)) %>% + # Need to mark this for nested counts + if (!exists('outer_')) outer_ <- FALSE + + complete_levels <- summary_stat %>% + complete_and_limit(treat_var, by, cols, target_var, limit_data_by, + .fill = list(n = 0, total = 0, distinct_n = 0, distinct_total = 0), + outer=outer_) + + summary_stat <- complete_levels %>% # Change the treat_var and first target_var to characters to resolve any # issues if there are total rows and the original column is numeric mutate(!!treat_var := as.character(!!treat_var)) %>% @@ -275,6 +274,25 @@ process_count_n <- function(x) { } + +#' Get Logical vector that is used to remove the treat_var and cols +#' +#' In total row and missing subject counts, denoms_by needs to be stripped of +#' cols and treat_var variables, otherwise it will error out in the group_by +#' +#' @param denoms_by The layer denoms by +#' @param treat_var table treat var +#' @param cols tables cols vars +#' +#' @return list of quosures +#' @noRd +get_needed_denoms_by <- function(denoms_by, treat_var, cols) { + map_lgl(denoms_by, function(x, treat_var, cols) { + all(as_name(x) != as_name(treat_var), + as_name(x) != map_chr(cols, as_name)) + }, treat_var, cols) +} + #' Process the amounts for a total row #' #' @param x A Count layer @@ -290,16 +308,8 @@ no denoms_by variable was set. This may cause unexpected results. If you wish to change this behavior, use `set_denoms_by()`.", immediate. 
= TRUE) } - # Make sure the denoms_by is stripped - # Stripped of cols and treat_var variables, otherwise it will error out in the group_by - # I thought of replacing the group by with !!!unique(c(treat_var, cols, denoms_by)) - # but that doesn't work due to the denoms_by having an environment set - # Logical vector that is used to remove the treat_var and cols - needed_denoms_by <- map_lgl(denoms_by, function(x, treat_var, cols) { - all(as_name(x) != as_name(treat_var), - as_name(x) != map_chr(cols, as_name)) - }, treat_var, cols) + needed_denoms_by <- get_needed_denoms_by(denoms_by, treat_var, cols) #Create an expression to evaluate filter if (!count_missings) { @@ -326,9 +336,46 @@ change this behavior, use `set_denoms_by()`.", immediate. = TRUE) # aren't symbols group_by(!!!extract_character_from_quo(by)) %>% # ungroup right away to make sure the complete works + ungroup() + }, envir = x) +} + +#' Process the amounts for a missing subjects row +#' +#' @param x A Count layer +#' @noRd +process_missing_subjects_row <- function(x) { + evalq({ + + # Logical vector that is used to remove the treat_var and cols + needed_denoms_by <- get_needed_denoms_by(denoms_by, treat_var, cols) + + # Create the merge variables to join the header_n data + mrg_vars <- map_chr(c(pop_treat_var, cols, denoms_by[needed_denoms_by]), as_name) + names(mrg_vars)[1] <- as_name(treat_var) + # create a data.frame to create total counts + missing_subjects_stat <- built_target %>% + # Use distinct if this is a distinct total row + # Group by all column variables + distinct(!!treat_var, !!!cols, !!!by, !!!distinct_by) %>% ungroup() %>% + count(!!treat_var, !!!cols, !!!by, name="n_present") %>% # complete based on missing groupings - complete(!!treat_var, !!!cols, fill = list(n = 0, total = 0)) + complete(!!treat_var, !!!cols, !!!by, fill = list(n_present = 0)) %>% + left_join( + header_n %>% rename(header_tots = n), by = mrg_vars + ) %>% + # Create a variable to label the totals when it is merged in. + mutate( + !!as_name(target_var[[1]]) := missing_subjects_row_label, + distinct_n = header_tots - n_present + ) %>% + # Create variables to carry forward 'by'. Only pull out the ones that + # aren't symbols + group_by(!!!extract_character_from_quo(by)) %>% + # ungroup right away to make sure the complete works + ungroup() %>% + select(-c(n_present, header_tots)) }, envir = x) } @@ -418,7 +465,9 @@ process_formatting.count_layer <- function(x, ...) { summary_var = summary_var, indentation_length = indentation_length, total_count_format = total_count_format, + missing_subjects_count_format = missing_subjects_count_format, total_row_label = total_row_label, + missing_subjects_row_label = missing_subjects_row_label, has_missing_count = has_missing_count) }) %>% # Pivot table @@ -479,19 +528,26 @@ process_formatting.count_layer <- function(x, ...) { #' target variable. #' @param indentation_length If this is a nested count layer. 
The row prefixes #' must be removed +#' @param total_count_format f_str for total counts +#' @param missing_subjects_count_format f_str for missing subjects +#' @param total_row_label Label string for total rows +#' @param missing_subjects_row_label Label string for missing subjects +#' @param has_missing_count Boolean for if missing counts are present #' #' @return A tibble replacing the original counts #' @noRd construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_total = NULL, count_fmt = NULL, max_layer_length, max_n_width, missing_string, missing_f_str, summary_var, indentation_length, total_count_format, - total_row_label, has_missing_count) { + missing_subjects_count_format, total_row_label, missing_subjects_row_label, + has_missing_count) { ## Added this for processing formatting in nested count layers where this won't be processed yet if (is.null(max_layer_length)) max_layer_length <- 0 if (is.null(max_n_width)) max_n_width <- 0 missing_rows <- FALSE total_rows <- FALSE + missing_subject_rows <- FALSE # Add in the missing format if its null and there are missing counts if (has_missing_count && is.null(missing_f_str)) { @@ -515,6 +571,12 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot total_vars_ord <- map_chr(total_count_format$vars, as_name) } + ## Pull out string information for missing subject rows + if (!is.null(missing_subjects_count_format)) { + missing_subject_rows <- summary_var %in% missing_subjects_row_label + missing_subject_vars_ord <- map_chr(missing_subjects_count_format$vars, as_name) + } + vars_ord <- map_chr(count_fmt$vars, as_name) # str_all is a list that contains character vectors for each parameter that might be calculated @@ -522,12 +584,16 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot # Append the repl_str to be passed to do.call str_all[1] <- count_fmt$repl_str # Iterate over every variable + rows_ <- !missing_rows & !total_rows & !missing_subject_rows for (i in seq_along(vars_ord)) { - str_all[[i + 1]] <- count_string_switch_help(vars_ord[i], count_fmt, .n[!missing_rows & !total_rows], .total[!missing_rows & !total_rows], - .distinct_n[!missing_rows & !total_rows], .distinct_total[!missing_rows & !total_rows], vars_ord) + str_all[[i + 1]] <- count_string_switch_help(vars_ord[i], count_fmt, + .n[rows_], + .total[rows_], + .distinct_n[rows_], + .distinct_total[rows_], + vars_ord) } - # Logic for missing # Same logic as above, just add for missing missing_str_all <- vector("list", 5) @@ -554,20 +620,30 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot total_vars_ord) } + missing_subs_str_all <- vector("list", 5) + missing_subs_str_all[1] <- missing_subjects_count_format$repl_str + for (i in seq_along(missing_subject_vars_ord)) { + missing_subs_str_all[[i + 1]] <- count_string_switch_help(missing_subject_vars_ord[i], + missing_subjects_count_format, + .n[missing_subject_rows], + .total[missing_subject_rows], + .distinct_n[missing_subject_rows], + .distinct_total[missing_subject_rows], + missing_subject_vars_ord) + } + # Put the vector strings together. Only include parts of str_all that aren't null - # nm is non-missing, m is mising, and t is total. 
+  # nm is non-missing, m is missing, t is total, ms is missing subjects
   string_nm <- do.call(sprintf, str_all[!map_lgl(str_all, is.null)])
   if (!is.null(missing_vars_ord)) string_m <- do.call(sprintf, missing_str_all[!map_lgl(missing_str_all, is.null)])
   if (!is.null(total_vars_ord)) string_t <- do.call(sprintf, total_str_all[!map_lgl(total_str_all, is.null)])
+  if (!is.null(missing_subject_vars_ord)) string_ms <- do.call(sprintf, missing_subs_str_all[!map_lgl(missing_subs_str_all, is.null)])
   # string_ is the final string to return. Merge the missing, non-missing, and others together
-  string_ <- character(length(string_nm) + length(string_m) + length(string_t))
-  string_[!missing_rows & !total_rows] <- string_nm
+  string_ <- character(sum(length(string_nm), length(string_m), length(string_t), length(string_ms)))
+  string_[rows_] <- string_nm
   string_[total_rows] <- string_t
   string_[missing_rows] <- string_m
-
-
-
-
+  string_[missing_subject_rows] <- string_ms
   # Left pad set to 0 meaning it won't pad to the left at all
   # right pad is set to the maximum n count in the table
   string_ <- pad_formatted_data(string_, 0, max_n_width)
diff --git a/R/count_bindings.R b/R/count_bindings.R
index 98cecd75..4dd1603c 100644
--- a/R/count_bindings.R
+++ b/R/count_bindings.R
@@ -12,6 +12,10 @@
 #' use \code{set_denoms_by()}, and the grouping of \code{add_total_row()} will
 #' be updated accordingly.
 #'
+#' Note that when using \code{add_total_row()} with \code{set_pop_data()}, you
+#' should call \code{add_total_row()} AFTER calling \code{set_pop_data()},
+#' otherwise there is potential for unexpected behavior with treatment groups.
+#'
 #' @param e A \code{count_layer} object
 #' @param fmt An f_str object used to format the total row. If none is provided,
 #'   display is based on the layer formatting.
@@ -457,7 +461,7 @@ set_result_order_var <- function(e, result_order_var) {
 #'   build()
 set_missing_count <- function(e, fmt = NULL, sort_value = NULL, denom_ignore = FALSE, ...) {
-  missings <- list(...)
+  missings <- list2(...)
   assert_that(length(missings) > 0, msg = "No missing values were specified.")
   if(!is.null(fmt)) assert_inherits_class(fmt, "f_str")
@@ -576,6 +580,10 @@ set_outer_sort_position <- function(e, outer_sort_position) {
 set_denom_where <- function(e, denom_where) {
   denom_where <- enquo(denom_where)
+  if (!(inherits(e, 'tplyr_layer') | inherits(e, 'tplyr_table'))) {
+    stop('Object type should be either "tplyr_layer" or "tplyr_table"', call.=FALSE)
+  }
+
   assert_that(is_logical_or_call(denom_where),
               msg = "The `where` parameter must contain subsetting logic (enter without quotes)")
@@ -644,7 +652,7 @@ set_denoms_by.count_layer <- function(e, ...) {
 #'   build()
 #'
 keep_levels <- function(e, ...) {
-  dots <- list(...)
+  dots <- list2(...)
   assert_that(all(map_lgl(dots, is.character)),
               msg = "must pass character values to `keep_levels`")
@@ -703,3 +711,81 @@ set_numeric_threshold <- function(e, numeric_cutoff, stat, column = NULL) {
   e
 }
+
+#' Add a missing subject row into a count summary.
+#'
+#' This function calculates the number of subjects missing from a particular
+#' group of results. The calculation is done by examining the total number of
+#' subjects potentially available from the Header N values within the result
+#' column, and finding the difference with the total number of subjects present
+#' in the result group. Note that for accurate results, the subject variable
+#' needs to be defined using the `set_distinct_by()` function. As with other
+#' methods, this function instructs how distinct results should be identified.
+#'
+#' @param e A `count_layer` object
+#' @param fmt An f_str object used to format the missing subjects row. If none
+#'   is provided, display is based on the layer formatting.
+#' @param sort_value The value that will appear in the ordering column for
+#'   missing subjects rows. This must be a numeric value.
+#'
+#' @export
+#' @examples
+#'
+#' tplyr_table(mtcars, gear) %>%
+#'   add_layer(
+#'     group_count(cyl) %>%
+#'       add_missing_subjects_row(f_str("xxxx", n))
+#'   ) %>%
+#'   build()
+add_missing_subjects_row <- function(e, fmt = NULL, sort_value = NULL) {
+  if(!is.null(fmt)) assert_inherits_class(fmt, "f_str")
+  if(!is.null(sort_value)) assert_inherits_class(sort_value, "numeric")
+  if("shift_layer" %in% class(e)) {
+    rlang::abort("`add_missing_subjects_row` for shift layers is not yet supported")
+  }
+  assert_inherits_class(e, "count_layer")
+
+  if (identical(env_get(env_parent(e), 'target'), env_get(env_parent(e), 'pop_data'))) {
+    warning(paste("\tPopulation data was not set separately from the target data.",
+                  "\tMissing subject counts may be misleading in this scenario.",
+                  "\tDid you mean to use `set_missing_count()` instead?",
+                  sep="\n"))
+  }
+
+  env_bind(e, include_missing_subjects_row = TRUE)
+  env_bind(e, missing_subjects_count_format = fmt)
+  env_bind(e, missing_subjects_sort_value = sort_value)
+
+  e
+}
+
+#' Set the label for the missing subjects row
+#'
+#' @param e A \code{count_layer} object
+#' @param missing_subjects_row_label A character to label the missing subjects row
+#'
+#' @return The modified \code{count_layer} object
+#' @export
+#'
+#' @examples
+#'
+#' t <- tplyr_table(mtcars, gear) %>%
+#'   add_layer(
+#'     group_count(cyl) %>%
+#'       add_missing_subjects_row() %>%
+#'       set_missing_subjects_row_label("Missing")
+#'   )
+#' build(t)
+set_missing_subjects_row_label <- function(e, missing_subjects_row_label) {
+
+  assert_has_class(missing_subjects_row_label, "character")
+  assert_that(length(missing_subjects_row_label) == 1)
+  if("shift_layer" %in% class(e)) {
+    rlang::abort("`set_missing_subjects_row_label` for shift layers is not yet supported")
+  }
+  assert_inherits_class(e, "count_layer")
+
+  env_bind(e, missing_subjects_row_label = missing_subjects_row_label)
+
+  e
+}
diff --git a/R/data.R b/R/data.R
new file mode 100644
index 00000000..2627ef3f
--- /dev/null
+++ b/R/data.R
@@ -0,0 +1,75 @@
+#' ADSL Data
+#'
+#' A subset of the PHUSE Test Data Factory ADSL data set.
+#'
+#' @format A data.frame with 254 rows and 49 columns.
+#'
+#' @seealso [get_data_labels()]
+#'
+#' @source https://github.com/phuse-org/TestDataFactory
+#'
+"tplyr_adsl"
+
+
+#' ADAE Data
+#'
+#' A subset of the PHUSE Test Data Factory ADAE data set.
+#'
+#' @format A data.frame with 276 rows and 55 columns.
+#'
+#' @seealso [get_data_labels()]
+#'
+#' @source https://github.com/phuse-org/TestDataFactory
+#'
+"tplyr_adae"
+
+#' ADAS Data
+#'
+#' A subset of the PHUSE Test Data Factory ADAS data set.
+#'
+#' @format A data.frame with 1,040 rows and 40 columns.
+#'
+#' @seealso [get_data_labels()]
+#'
+#' @source https://github.com/phuse-org/TestDataFactory
+#'
+"tplyr_adas"
+
+#' ADLB Data
+#'
+#' A subset of the PHUSE Test Data Factory ADLB data set.
+#'
+#' @format A data.frame with 311 rows and 46 columns.
+#' +#' @seealso [get_data_labels()] +#' +#' @source https://github.com/phuse-org/TestDataFactory +#' +"tplyr_adlb" + +#' ADPE Data +#' +#' A mock-up dataset that is fit for testing data limiting +#' +#' @format A data.frame with 21 rows and 8 columns. +#' +#' +"tplyr_adpe" + +#' Get Data Labels +#' +#' Get labels for data sets included in Tplyr. +#' +#' @param data A Tplyr data set. +#' +#' @return A data.frame with columns `name` and `label` containing the names and labels of each column. +#' +#' @export +get_data_labels <- function(data) { + map_dfr( + names(data), + function(name) { + list(name = name, label = attr(data[[name]], "label")) + } + ) +} diff --git a/R/desc.R b/R/desc.R index c6ad4b6c..40e36686 100644 --- a/R/desc.R +++ b/R/desc.R @@ -49,16 +49,16 @@ process_summaries.desc_layer <- function(x, ...) { summaries <- get_summaries()[match_exact(summary_vars)] # Create the numeric summary data - num_sums_raw[[i]] <- built_target %>% + cmplt1 <- built_target %>% # Rename the current variable to make each iteration use a generic name rename(.var = !!cur_var) %>% # Group by treatment, provided by variable, and provided column variables group_by(!!treat_var, !!!by, !!!cols) %>% # Execute the summaries summarize(!!!summaries) %>% - ungroup() %>% - # Fill in any missing treat/col combinations - complete(!!treat_var, !!!by, !!!cols) + ungroup() + + num_sums_raw[[i]] <- complete_and_limit(cmplt1, treat_var, by, cols, limit_data_by=limit_data_by) # Create the transposed summary data to prepare for formatting trans_sums[[i]] <- num_sums_raw[[i]] %>% @@ -188,6 +188,9 @@ process_formatting.desc_layer <- function(x, ...) { env_get(x, "formatted_data") } +# Small helper function to help with builtins +inf_to_na <- function(x) if_else(is.infinite(x), NA, x) + #' Get the summaries to be passed forward into \code{dplyr::summarize()} #' #' @param e the environment summaries are stored in. @@ -203,8 +206,8 @@ get_summaries <- function(e = caller_env()) { sd = sd(.var, na.rm=TRUE), median = median(.var, na.rm=TRUE), var = var(.var, na.rm=TRUE), - min = min(.var, na.rm=TRUE), - max = max(.var, na.rm=TRUE), + min = inf_to_na(min(.var, na.rm=TRUE)), + max = inf_to_na(max(.var, na.rm=TRUE)), iqr = IQR(.var, na.rm=TRUE, type=getOption('tplyr.quantile_type')), q1 = quantile(.var, na.rm=TRUE, type=getOption('tplyr.quantile_type'))[[2]], q3 = quantile(.var, na.rm=TRUE, type=getOption('tplyr.quantile_type'))[[4]], @@ -225,7 +228,7 @@ get_summaries <- function(e = caller_env()) { #' @noRd construct_desc_string <- function(..., .fmt_str=NULL) { # Unpack names into current namespace for ease - list2env(list(...), envir=environment()) + list2env(list2(...), envir=environment()) # Get the current format to be applied fmt <- .fmt_str[[row_label]] diff --git a/R/format.R b/R/format.R index e30100a4..c80abe4c 100644 --- a/R/format.R +++ b/R/format.R @@ -360,7 +360,7 @@ count_f_str_check <- function(...) { } # Grab the named parameters - params <- list(...) + params <- list2(...) # Currently supported format names valid_names <- c("n_counts", "riskdiff") diff --git a/R/layer.R b/R/layer.R index abaea5a9..be165ac5 100644 --- a/R/layer.R +++ b/R/layer.R @@ -101,18 +101,10 @@ as_tplyr_layer.tplyr_layer <- function(parent, target_var, by, where, type, ...) layer } -#' S3 method for tplyr layer creation of \code{tplyr_subgroup_layer} object as parent -#' @noRd -as_tplyr_layer.tplyr_subgroup_layer <- function(parent, target_var, by, where, type, ...) { - layer <- new_tplyr_layer(parent, target_var, by, where, type, ...) 
- class(layer) <- unique(append('tplyr_subgroup_layer', class(layer))) - layer -} - #' S3 method to produce error for unsupported objects as parent #' @noRd as_tplyr_layer.default <- function(parent, target_var, by, where, type, ...) { - stop('Must provide `tplyr_table`, `tplyr_layer`, or `tplyr_subgroup_layer` object from the `tplyr` package.') + stop('Must provide `tplyr_table` object from the `Tplyr` package.', call.=FALSE) } #' Create a new tplyr layer diff --git a/R/layer_templates.R b/R/layer_templates.R index 3de3c246..ad0e8865 100644 --- a/R/layer_templates.R +++ b/R/layer_templates.R @@ -177,7 +177,7 @@ get_layer_templates <- function() { #' target, by, and where parameters. #' @param add_params Additional parameters passed into layer modifier functions. #' These arguments are specified in a template within curly brackets such as -#' {param}. Supply as a named list, where the element name is the parameter. +#' \{param\}. Supply as a named list, where the element name is the parameter. #' #' @family Layer Templates #' @rdname layer_templates diff --git a/R/layering.R b/R/layering.R index 6a9e47c5..a6fd67aa 100644 --- a/R/layering.R +++ b/R/layering.R @@ -74,10 +74,10 @@ add_layer <- function(parent, layer, name=NULL) { # Insert the `parent` argument into the topmost call of the layer code # (i.e. if any pipes %>% then pull out the left most call and modify it) - l <- modify_nested_call(layer, parent=parent) + l <- quo_get_expr(modify_nested_call(layer, parent=parent)) # Evaluate the layer and grab `tplyr_layer` or `tplyr_subgroup_layer` object - executed_layer <- list(eval(quo_get_expr(l))) + executed_layer <- list(eval(l, envir=caller_env())) # Attach the name names(executed_layer) <- name @@ -100,10 +100,10 @@ add_layers <- function(parent, ...) { # Parent exists assert_that(!missing(parent), msg = "`parent` parameter must be provided") # all objects are Tplyr layers - map(list(...), assert_is_layer) + map(list2(...), assert_is_layer) # Insert the layer into the parent object - parent$layers <- append(parent$layers, list(...)) + parent$layers <- append(parent$layers, list2(...)) parent } diff --git a/R/meta-builders.R b/R/meta-builders.R index 941b679e..3cae0b2a 100644 --- a/R/meta-builders.R +++ b/R/meta-builders.R @@ -47,7 +47,7 @@ build_desc_meta <- function(target, table_where, layer_where, treat_grps, ...) { # Don't want any of the named parameters here variables <- variables[which(names(variables)=='')] - values <- list(...) + values <- list2(...) # Get rid of text provided by variables inds <- which(map_lgl(unname(variables), ~ quo_class(.) == "name")) @@ -85,7 +85,7 @@ build_count_meta <- function(layer, table_where, layer_where, treat_grps, summar # Don't want any of the named parameters here variables <- variables[which(names(variables)=='')] - values <- list(...) + values <- list2(...) # Get rid of text provided by variables inds <- which(map_lgl(unname(variables), ~ quo_class(.) 
== "name"))
@@ -94,6 +94,7 @@ build_count_meta <- function(layer, table_where, layer_where, treat_grps, summar
   # The total row label may not pass through, so set it
   total_row_label <- ifelse(is.null(layer$total_row_label), 'Total', layer$total_row_label)
+  missing_subjects_row_label <- ifelse(is.null(layer$missing_subjects_row_label), 'Missing', layer$missing_subjects_row_label)
   count_missings <- ifelse(is.null(layer$count_missings), FALSE, layer$count_missings)
   mlist <- layer$missing_count_list
@@ -101,6 +102,9 @@
   character_outer <- get_character_outer(layer)
   unnested_character <- is_unnested_character(layer)
+  # Pull out table object to use later
+  tbl <- env_parent(layer)
+
   meta <- vector('list', length(values[[1]]))
   # Vectorize across the input data
@@ -113,6 +117,7 @@
     }
     row_filter <- list()
+    aj <- NULL
     # Pull out the current row's values
     cur_values <- map(values, ~ .x[i])
@@ -130,21 +135,26 @@
       if (summary_var[i] == total_row_label && !count_missings) {
         # Filter out the missing counts if the total row should exclude missings
         row_filter <- make_parsed_strings(layer$target_var, list(mlist), negate=TRUE)
-      }
-      else if (summary_var[i] %in% names(mlist)) {
+      } else if (summary_var[i] == missing_subjects_row_label) {
+        # Special handling for missing subject rows
+        # Make a meta object for the pop data
+        pop_filt_inds <- which(filter_variables %in% unlist(list(tbl$treat_var, tbl$cols)))
+        pop_filt_vars <- filter_variables[pop_filt_inds]
+        pop_filt_vals <- filter_values[pop_filt_inds]
+        pop_meta <- build_meta(tbl$pop_where, quo(TRUE), treat_grps, pop_filt_vars, pop_filt_vals)
+        aj <- new_anti_join(join_meta=pop_meta, on=layer$distinct_by)
+      } else if (summary_var[i] %in% names(mlist)) {
         # Get the values for the missing row
         miss_val <- mlist[which(names(mlist) == summary_var[i])]
         row_filter <- make_parsed_strings(layer$target_var, list(miss_val))
-      }
-      else if (summary_var[i] != total_row_label) {
+      } else if (summary_var[i] != total_row_label) {
        # Subset to outer layer value
        row_filter <- make_parsed_strings(na_var, summary_var[i])
      }
      add_vars <- append(add_vars, na_var)
-    }
-    else {
+    } else {
      # Inside the nested layer
      filter_variables <- variables
      filter_values <- cur_values
@@ -162,6 +172,18 @@
      }
      else if (summary_var[i] == total_row_label && !count_missings) {
        # Filter out the missing counts if the total row should exclude missings
        row_filter <- make_parsed_strings(layer$target_var, list(mlist), negate=TRUE)
+      } else if (summary_var[i] == missing_subjects_row_label) {
+        # Special handling for missing subject rows
+        # Make a meta object for the pop data
+        pop_filt_inds <- which(filter_variables %in% unlist(list(tbl$treat_var, tbl$cols)))
+        pop_filt_vars <- filter_variables[pop_filt_inds]
+        pop_filt_vals <- filter_values[pop_filt_inds]
+        # Reset to the pop treat value
+        pop_filt_vars[[
+          which(map_chr(pop_filt_vars, as_label) == as_label(tbl$treat_var))
+        ]] <- tbl$pop_treat_var
+        pop_meta <- build_meta(tbl$pop_where, quo(TRUE), treat_grps, pop_filt_vars, pop_filt_vals)
+        aj <- new_anti_join(join_meta=pop_meta, on=layer$distinct_by)
       } else if (!is.na(character_outer) && summary_var[i] == character_outer) {
         # If the outer layer is a character string then don't provide a filter
@@ -176,8
+198,8 @@ build_count_meta <- function(layer, table_where, layer_where, treat_grps, summar # Make the meta object meta[[i]] <- build_meta(table_where, layer_where, treat_grps, filter_variables, filter_values) %>% add_filters_(row_filter) %>% - add_variables_(add_vars) - + add_variables_(add_vars) %>% + add_anti_join_(aj) } meta @@ -220,7 +242,7 @@ build_shift_meta <- function(layer, table_where, layer_where, treat_grps, summar # Don't want any of the named parameters here variables <- variables[which(names(variables)=='')] - values <- list(...) + values <- list2(...) # Get rid of text provided by variables inds <- which(map_lgl(unname(variables), ~ quo_class(.) == "name")) diff --git a/R/meta.R b/R/meta.R index 78ccaa2f..86da7d37 100644 --- a/R/meta.R +++ b/R/meta.R @@ -221,8 +221,93 @@ print.tplyr_meta <- function(x, ...) { cat("Names:\n") names <- map_chr(x$names, as_label) filters <- map_chr(x$filters, as_label) - cat(" ", paste(names, collapse = ", "), "\n") + cat(" ", paste0(names, collapse = ", "), "\n") cat("Filters:\n") - cat(" ", paste(filters, collapse = ", "), "\n") + cat(" ", paste0(filters, collapse = ", "), "\n") + if (!is.null(x$anti_join)) { + cat("Anti-join:\n") + cat(" Join Meta:\n") + cat(paste0(" ", capture.output(x$anti_join$join_meta), "\n"), sep="") + cat(" On:\n") + aj_on <- map_chr(x$anti_join$on, as_label) + cat(" ", paste0(aj_on, collapse = ", "), "\n") + } invisible() } + +#' Create an tplyr_meta_anti_join object +#' +#' @return tplyr_meta_anti_join object +#' @noRd +new_anti_join <- function(join_meta, on) { + structure( + list( + join_meta = join_meta, + on = on + ), + class="tplyr_meta_anti_join" + ) +} + +#' Internal application of anti_join onto tplyr_meta object +#' @noRd +add_anti_join_ <- function(meta, aj) { + meta$anti_join <- aj + meta +} + +#' Add an anti-join onto a tplyr_meta object +#' +#' An anti-join allows a tplyr_meta object to refer to data that should be +#' extracted from a separate dataset, like the population data of a Tplyr table, +#' that is unavailable in the target dataset. The primary use case for this is +#' the presentation of missing subjects, which in a Tplyr table is presented +#' using the function `add_missing_subjects_row()`. The missing subjects +#' themselves are not present in the target data, and are thus only available in +#' the population data. The `add_anti_join()` function allows you to provide the +#' meta information relevant to the population data, and then specify the `on` +#' variable that should be used to join with the target dataset and find the +#' values present in the population data that are missing from the target data. +#' +#' @param meta A tplyr_meta object referring to the target data +#' @param join_meta A tplyr_meta object referring to the population data +#' @param on A list of quosures containing symbols - most likely set to USUBJID. 
+#' +#' @return A tplyr_meta object +#' @md +#' @export +#' +#' @examples +#' +#' tm <- tplyr_meta( +#' rlang::quos(TRT01A, SEX, ETHNIC, RACE), +#' rlang::quos(TRT01A == "Placebo", TRT01A == "SEX", ETHNIC == "HISPANIC OR LATINO") +#' ) +#' +#' tm %>% +#' add_anti_join( +#' tplyr_meta( +#' rlang::quos(TRT01A, ETHNIC), +#' rlang::quos(TRT01A == "Placebo", ETHNIC == "HISPANIC OR LATINO") +#' ), +#' on = rlang::quos(USUBJID) +#' ) +add_anti_join <- function(meta, join_meta, on){ + + if (!inherits(meta, 'tplyr_meta')) { + stop("meta must be a tplyr_meta object", call.=FALSE) + } + + if (!inherits(join_meta, 'tplyr_meta')) { + stop("join_meta must be a tplyr_meta object", call.=FALSE) + } + + if (!all(map_lgl(on, ~ is_quosure(.) && quo_is_symbol(.)))) { + stop("on must be provided as a list of names", call.=FALSE) + } + + + aj <- new_anti_join(join_meta, on) + + add_anti_join_(meta, aj) +} diff --git a/R/meta_utils.R b/R/meta_utils.R index fabbbcbd..3d0fc36e 100644 --- a/R/meta_utils.R +++ b/R/meta_utils.R @@ -54,7 +54,7 @@ get_meta_result <- function(x, row_id, column, ...) { get_meta_result.tplyr_table <- function(x, row_id, column, ...) { m <- x$metadata - get_meta_result.data.frame(m, row_id, column) + get_meta_result.data.frame(m, row_id, column, ...) } #' @export @@ -69,6 +69,10 @@ get_meta_result.data.frame <- function(x, row_id, column, ...) { 'column present in the built Tplyr dataframe'), call.=FALSE) } + if (length(list(...)) > 0) { + warning("Extra arguments were provided to get_meta_result() that will not be used.", immediate.=TRUE) + } + # Pull out the cell of interest res <- x[[which(x$row_id == row_id), column]][[1]] @@ -109,6 +113,8 @@ get_meta_result.data.frame <- function(x, row_id, column, ...) { #' @param column The result column of interest, provided as a character string #' @param add_cols Additional columns to include in subset data.frame output #' @param target A data frame to be subset (if not pulled from a Tplyr table) +#' @param pop_data A data frame to be subset through an anti-join (if not pulled +#' from a Tplyr table) #' @param ... additional arguments #' #' @return A data.frame @@ -139,7 +145,8 @@ get_meta_subset <- function(x, row_id, column, add_cols = vars(USUBJID), ...) { #' @export #' @rdname get_meta_subset get_meta_subset.data.frame <- function(x, row_id, column, - add_cols = vars(USUBJID), target = NULL, ...) { + add_cols = vars(USUBJID), + target = NULL, pop_data = NULL, ...) 
{ # Get the metadata object ready m <- get_meta_result(x, row_id, column) @@ -152,9 +159,33 @@ get_meta_subset.data.frame <- function(x, row_id, column, stop("If querying metadata without a tplyr_table, a target must be provided", call.=FALSE) } - target %>% + if (length(list(...)) > 0) { + warning("Extra arguments were provided to get_meta_subset() that will not be used.", immediate.=TRUE) + } + + out <- target %>% filter(!!!m$filters) %>% select(!!!add_cols, !!!m$names) + + if (!is.null(m$anti_join)) { + aj <- m$anti_join + pd <- pop_data %>% + filter(!!!aj$join_meta$filters) %>% + select(!!!aj$on, !!!add_cols, !!!aj$join_meta$names) + + mrg_var <- map_chr(aj$on, as_name) + names(mrg_var) <- mrg_var + + if (!(mrg_var %in% names(pd)) | !(mrg_var %in% names(out))) { + stop(paste0( + "The `on` variable specified is missing from either the target data or the population data subsets.\n ", + "Try adding the `on` variables to the `add_cols` parameter") + ) + } + out <- anti_join(pd, out, by=mrg_var) + } + + out } #' @export @@ -164,13 +195,7 @@ get_meta_subset.tplyr_table <- function(x, row_id, column, add_cols = vars(USUBJ # Get the metadata object ready m <- get_meta_result(x, row_id, column) - if (!inherits(add_cols, 'quosures')) { - stop("add_cols must be provided using `dplyr::vars()`", call.=FALSE) - } - - # Subset and return the data - x$target %>% - filter(!!!m$filters) %>% - select(!!!add_cols, !!!m$names) + get_meta_subset(x$metadata, row_id, column, add_cols = add_cols, + target = x$target, pop_data = x$pop_data) } diff --git a/R/nested.R b/R/nested.R index 322b1588..9843c188 100644 --- a/R/nested.R +++ b/R/nested.R @@ -8,14 +8,6 @@ process_nested_count_target <- function(x) { assert_that(quo_is_symbol(target_var[[2]]), msg = "Inner layers must be data driven variables") - if(quo_is_symbol(target_var[[1]])){ - first_var_length <- length(unique(target[[as_name(target_var[[1]])]])) - second_var_length <- length(unique(target[[as_name(target_var[[2]])]])) - - assert_that(second_var_length >= first_var_length, - msg = "The number of values of your second variable must be greater than the number of levels in your first variable") - } - if(is.factor(target[[as_name(target_var[[1]])]])) { warning(paste0("Factors are not currently supported in nested count layers", " that have two data driven variables. 
Factors will be coerced into character vectors"), @@ -40,9 +32,15 @@ process_nested_count_target <- function(x) { second_denoms_by <- denoms_by } - first_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[1]], - by = vars(!!!by), where = !!where)) + # Missing subject counts should not occur in the outer layer + fl <- group_count(current_env(), target_var = !!target_var[[1]], + by = vars(!!!by), where = !!where) + fl$include_missing_subjects_row <- FALSE + outer_ <- TRUE + first_layer <- process_summaries(fl) + + outer_ <- FALSE second_layer <- process_summaries(group_count(current_env(), target_var = !!target_var[[2]], by = vars(!!target_var[[1]], !!!by), where = !!where) %>% set_count_row_prefix(indentation) %>% @@ -58,7 +56,8 @@ process_nested_count_target <- function(x) { treat_var = treat_var ) %>% group_by(!!target_var[[1]]) %>% - do(filter_nested_inner_layer(., target, target_var[[1]], target_var[[2]], indentation)) + do(filter_nested_inner_layer(., target, target_var[[1]], target_var[[2]], indentation, + missing_subjects_row_label)) ignored_filter_rows <- ifelse(include_total_row, ifelse(is.null(total_row_label), @@ -93,7 +92,8 @@ process_nested_count_target <- function(x) { #' This function is meant to remove the values of an inner layer that don't #' appear in the target data #' @noRd -filter_nested_inner_layer <- function(.group, target, outer_name, inner_name, indentation) { +filter_nested_inner_layer <- function(.group, target, outer_name, inner_name, indentation, + missing_subjects_row_label) { # Is outer variable text? If it is don't filter on it text_outer <- !quo_is_symbol(outer_name) @@ -116,9 +116,13 @@ filter_nested_inner_layer <- function(.group, target, outer_name, inner_name, in filter(!!sym(outer_name) == current_outer_value) %>% select(any_of(inner_name)) %>% unlist() %>% - paste0(indentation, .) + paste0(indentation, .) %>% + unique() } + target_inner_values <- c(target_inner_values %>% unique(), + paste0(indentation, missing_subjects_row_label)) + .group %>% filter(summary_var %in% target_inner_values) diff --git a/R/pop_data.R b/R/pop_data.R index d8af419d..9e79f1d0 100644 --- a/R/pop_data.R +++ b/R/pop_data.R @@ -36,27 +36,33 @@ build_header_n <- function(table) { #' Combine existing treatment groups for summary #' -#' Summary tables often present individual treatment groups, -#' but may additionally have a "Treatment vs. Placebo" or "Total" group added -#' to show grouped summary statistics or counts. This set of functions offers -#' an interface to add these groups at a table level and be consumed by -#' subsequent layers. +#' Summary tables often present individual treatment groups, but may +#' additionally have a "Treatment vs. Placebo" or "Total" group added to show +#' grouped summary statistics or counts. This set of functions offers an +#' interface to add these groups at a table level and be consumed by subsequent +#' layers. #' #' \code{add_treat_grps} allows you to specify specific groupings. This is done -#' by supplying named arguments, where the name becomes the new treatment group's -#' name, and those treatment groups are made up of the argument's values. +#' by supplying named arguments, where the name becomes the new treatment +#' group's name, and those treatment groups are made up of the argument's +#' values. #' -#' \code{add_total_group} is a simple wrapper around \code{add_treat_grps}. Instead of -#' producing custom groupings, it produces a "Total" group by the supplied name, which -#' defaults to "Total". 
This "Total" group is made up of all existing treatment
-#' groups within the population dataset.
+#' \code{add_total_group} is a simple wrapper around \code{add_treat_grps}.
+#' Instead of producing custom groupings, it produces a "Total" group by the
+#' supplied name, which defaults to "Total". This "Total" group is made up of
+#' all existing treatment groups within the population dataset.
 #'
-#' The function \code{treat_grps} allows you to see the custom treatment groups available
-#' in your \code{tplyr_table} object
+#' Note that when using \code{add_treat_grps} or \code{add_total_row()} with
+#' \code{set_pop_data()}, you should call them AFTER calling
+#' \code{set_pop_data()}, otherwise there is potential for unexpected behavior
+#' with treatment groups.
+#'
+#' The function \code{treat_grps} allows you to see the custom treatment groups
+#' available in your \code{tplyr_table} object
 #'
 #' @param table A \code{tplyr_table} object
-#' @param ... A named vector where names will become the new treatment group names,
-#' and values will be used to construct those treatment groups
+#' @param ... A named vector where names will become the new treatment group
+#'   names, and values will be used to construct those treatment groups
 #'
 #' @return The modified table object
 #' @export
@@ -81,13 +87,13 @@
 add_treat_grps <- function(table, ...) {
-  assert_that(is_named(list(...)), msg="Treatment group arguments must have names")
+  assert_that(is_named(list2(...)), msg="Treatment group arguments must have names")
   assert_that(inherits(table, "tplyr_table"),
               msg = "Treatment groups can only be added to `tplyr_table` objects")
   # Check parameters
-  fargs <- list(...)
+  fargs <- list2(...)
   # Bind the specified treatment groups to the table
   env_bind(table, treat_grps = append(treat_grps(table), fargs))
diff --git a/R/replace_leading_whitespace.R b/R/replace_leading_whitespace.R
new file mode 100644
index 00000000..94907755
--- /dev/null
+++ b/R/replace_leading_whitespace.R
@@ -0,0 +1,34 @@
+#' Reformat strings with leading whitespace for HTML
+#'
+#' @param x Target string
+#' @param tab_width Number of spaces to compensate for tabs
+#'
+#' @return String with &nbsp; replaced for leading whitespace
+#' @export
+#'
+#' @examples
+#' x <- c(" Hello there", " Goodbye Friend ", "\tNice to meet you",
+#'        " \t What are you up to? \t \t ")
+#' replace_leading_whitespace(x)
+#'
+#' replace_leading_whitespace(x, tab=2)
+#'
+replace_leading_whitespace <- function(x, tab_width=4) {
+  # Pull out the leading whitespace chunk
+  leading_spaces <- str_match(x, "^([ \\t])+")[,1]
+  # Count spaces and tabs, factor in tab width
+  spaces <- str_count(leading_spaces, pattern = " ")
+  tabs <- str_count(leading_spaces, pattern = "\\t") * tab_width
+  leading_length <- as.integer(spaces + tabs)
+
+  # Build the &nbsp; string and combine with the trimmed string
+  nbsp_string <- map_chr(leading_length, \(.x) {
+    if (!is.na(.x)) {
+      paste(rep("&nbsp;", .x), collapse="")
+    } else {
+      ""
+    }})
+  minus_whitespace <- str_trim(x, side='left')
+  paste(nbsp_string, minus_whitespace, sep="")
+}
+
diff --git a/R/riskdiff.R b/R/riskdiff.R
index 9cbd53f2..be17c0b8 100644
--- a/R/riskdiff.R
+++ b/R/riskdiff.R
@@ -115,7 +115,7 @@ add_risk_diff <- function(layer, ..., args=list(), distinct=TRUE) {
   # grab the ellipsis args into a list
-  comps <- list(...)
+  comps <- list2(...)
# Must be character, must have 2 elements assert_that(all(map_lgl(comps, is.character)), all(map_lgl(comps, ~ length(.x) == 2)), @@ -242,7 +242,7 @@ riskdiff <- function(diff_group, n_comp, n_ref, total_comp, total_ref, args=list high = NA ) - out <- append(list(...), out) + out <- append(list2(...), out) # Rename # Totals in the 2 way must be positive @@ -265,10 +265,10 @@ riskdiff <- function(diff_group, n_comp, n_ref, total_comp, total_ref, args=list construct_riskdiff_string <- function(..., .fmt_str=NULL) { # Unpack names into current namespace for ease - list2env(list(...), envir=environment()) + list2env(list2(...), envir=environment()) # Return empty when necessary - if (any(is.na(list(...)))) { + if (any(is.na(list2(...)))) { return(.fmt_str$empty) } diff --git a/R/set_format_strings.R b/R/set_format_strings.R index 651dd4d2..49b488a6 100644 --- a/R/set_format_strings.R +++ b/R/set_format_strings.R @@ -130,7 +130,7 @@ set_format_strings.desc_layer <- function(e, ..., cap=getOption('tplyr.precision # Pick off the ellipsis - format_strings <- list(...) + format_strings <- list2(...) # Get the list of variable names that need to be transposed @@ -185,9 +185,10 @@ set_format_strings.count_layer <- function(e, ...) { e } +#' @export set_format_strings.shift_layer <- function(e, ...) { - dots <- list(...) + dots <- list2(...) assert_that(all(dots$vars %in% c("n", "pct")), msg = "formats in shift layers can only be n") diff --git a/R/set_limit_data_by.R b/R/set_limit_data_by.R new file mode 100644 index 00000000..321c3366 --- /dev/null +++ b/R/set_limit_data_by.R @@ -0,0 +1,134 @@ +#' Set variables to limit reported data values only to those that exist rather +#' than fully completing all possible levels +#' +#' This function allows you to select a combination of by variables or +#' potentially target variables for which you only want to display values +#' present in the data. By default, Tplyr will create a cartesian combination of +#' potential values of the data. For example, if you have 2 by variables +#' present, then each potential combination of those by variables will have a +#' row present in the final table. `set_limit_data_by()` allows you to choose +#' the by variables whose combination you wish to limit to values physically +#' present in the available data. +#' +#' @param e A tplyr_layer +#' @param ... Subset of variables within by or target variables +#' +#' @return a tplyr_table +#' @md +#' @export +#' +#' @examples +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) +#' ) %>% +#' build() +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% +#' set_limit_data_by(PARAM, AVISIT) +#' ) %>% +#' build() +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% +#' set_limit_data_by(PARAM, AVISIT) +#' ) %>% +#' build() +#' +#' tplyr_table(tplyr_adpe, TRT01A) %>% +#' add_layer( +#' group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% +#' set_limit_data_by(PECAT, PARAM, AVISIT) +#' ) %>% +#' build() +set_limit_data_by <- function(e, ...) { + UseMethod("set_limit_data_by") +} + +#' @export +#' @noRd +set_limit_data_by.count_layer <- function(e, ...) { + dots <- enquos(...) 
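The swaps from `list(...)` to `rlang::list2(...)` above (and elsewhere in this change) enable dynamic dots, so pre-assembled argument sets can be spliced into these ellipses with `!!!`. A small sketch against `add_treat_grps()`, which now collects its arguments the same way; the iris groupings mirror the package's own examples:

```r
library(rlang)
library(Tplyr)

# A named list of custom groupings built ahead of time...
grps <- list("Not Setosa" = c("versicolor", "virginica"))

# ...can now be spliced straight into the ellipsis with !!!
tplyr_table(iris, Species) %>%
  add_treat_grps(!!!grps) %>%
  add_layer(group_desc(Sepal.Length)) %>%
  build()
```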
+ dots_chr <- map_chr(dots, as_name) + + # Pull these variables to make sure the denoms used make sense + by_ <- map_chr(env_get(e, "by"), as_name) + tv_ <- map_chr(env_get(e, "target_var"), as_name) + + if (!all(dots_chr %in% c(by_, tv_))) { + stop("Limit data by variables must be included in by variables or target variable set on layer", call.=FALSE) + } + + env_bind(e, limit_data_by = dots) + e +} + +#' @export +#' @noRd +set_limit_data_by.shift_layer <- function(e, ...) { + set_limit_data_by.count_layer(e, ...) +} + +#' @export +#' @noRd +set_limit_data_by.desc_layer <- function(e, ...) { + dots <- enquos(...) + dots_chr <- map_chr(dots, as_name) + + # Pull these variables to make sure the denoms used make sense + by_ <- map_chr(env_get(e, "by"), as_name) + + if (!all(dots_chr %in% by_)) { + stop("Limit data by variables must be included in by variables set on layer", call.=FALSE) + } + + env_bind(e, limit_data_by = dots) + e +} + +#' General function used to process the steps to pad levels in data, or limit to +#' combinations available in the data itself +#' +#' @param dat Input dataset +#' @param treat_var treat_var from tplyr_table +#' @param by by from tplyr_layer +#' @param cols cols from tplyr_table +#' @param target_var target_var from tplyr_layer +#' @param limit_data_by The variables to limit data by +#' @param .fill .fill parameter passed onto dplyr::complete +#' @param outer Whether to bypass variables if working through the outer layer +#' +#' @noRd +complete_and_limit <- function(dat, treat_var, by, cols, target_var=quos(), limit_data_by, .fill=list(), outer=FALSE) { + + complete_levels <- dat %>% + # complete all combinations of factors to include combinations that don't exist. + # add 0 for combinations that don't exist + complete(!!treat_var, !!!by, !!!unname(target_var), !!!cols, + fill = .fill) + + # Apply data limits specified by setter + if (!is.null(limit_data_by)) { + # Outer layer won't have the target variable to limit by + if (outer) { + limit_data_by <- limit_data_by[map_chr(limit_data_by, as_name) %in% names(dat)] + } + + # Find the combinations actually in the data + groups_in_data <- dat %>% + distinct(!!!limit_data_by) + + # Join back to limit the completed levels based on the preferred + # data driven ones + limited_data <- groups_in_data %>% + left_join(complete_levels, by = map_chr(limit_data_by, as_name)) + + return(limited_data) + } + + complete_levels +} diff --git a/R/shift.R b/R/shift.R index a6a38d7b..c6a42d9f 100644 --- a/R/shift.R +++ b/R/shift.R @@ -39,7 +39,6 @@ process_summaries.shift_layer <- function(x, ...) { process_shift_n <- function(x) { evalq({ - numeric_data <- built_target %>% # Group by variables including target variables and count them group_by(!!treat_var, !!!by, !!!unname(target_var), !!!cols) %>% @@ -47,7 +46,9 @@ process_shift_n <- function(x) { ungroup() %>% # complete all combinations of factors to include combinations that don't exist. 
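`complete_and_limit()` above wraps two steps: `tidyr::complete()` to expand the cartesian combinations, then an optional `distinct()` plus `left_join()` to keep only the by-combinations observed in the input. A standalone sketch of that logic outside of Tplyr, on made-up data:

```r
library(dplyr)
library(tidyr)

dat <- tibble(
  TRT    = c("A", "B", "A"),
  PARAM  = c("Head", "Head", "Lungs"),
  AVISIT = c("Screening", "Screening", "Day 5"),
  n      = c(1L, 2L, 1L)
)

# Step 1: complete all TRT x PARAM x AVISIT combinations (8 rows), filling 0s
completed <- dat %>%
  complete(TRT, PARAM, AVISIT, fill = list(n = 0L))

# Step 2: keep only PARAM/AVISIT pairs seen in the data, so e.g.
# ("Head", "Day 5") is dropped again while both treatments are retained
dat %>%
  distinct(PARAM, AVISIT) %>%
  left_join(completed, by = c("PARAM", "AVISIT"))
```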
# add 0 for combinations that don't exist - complete(!!treat_var, !!!by, !!!unname(target_var), !!!cols, fill = list(n = 0)) %>% + # complete(!!treat_var, !!!by, !!!unname(target_var), !!!cols, fill = list(n = 0)) %>% + complete_and_limit(treat_var, by, cols, unname(target_var), + limit_data_by, .fill = list(n = 0)) %>% # Change the treat_var and first target_var to characters to resolve any # issues if there are total rows and the original column is numeric mutate(!!treat_var := as.character(!!treat_var)) %>% diff --git a/R/sort.R b/R/sort.R index d4168df3..d36563a2 100644 --- a/R/sort.R +++ b/R/sort.R @@ -194,20 +194,28 @@ add_order_columns.count_layer <- function(x) { expr(!!sym(as_name(x)) == !!as_name(y)) }) + # Get the number of unique outer values, that is the number of rows to pull out. # If its text, it is just 1 to pull out - outer_number <- ifelse(quo_is_symbol(by[[1]]), - # Use built_target here to take the 'where' logic into account - length(unlist(unique(built_target[, as_name(by[[1]])]))), - 1) + # outer_number <- ifelse(quo_is_symbol(by[[1]]), + # # Use built_target here to take the 'where' logic into account + # nrow(filter(numeric_data, is.na(!!by[[1]]))), + # 1) + + # Identify the outer layer and attach it to the filter logic + filter_logic <- append(filter_logic, ifelse( + quo_is_symbol(by[[1]]), # Is the outside variable character or a symbol? + exprs(is.na(!!by[[1]])), # For symbols, the outer var will be NA + exprs(summary_var == !!by[[1]]) # For character, it will match summary_var + )) all_outer <- numeric_data %>% - filter(!!!filter_logic) %>% - extract(1:min(nrow(.), outer_number), ) + filter(!!!filter_logic) # Add the ordering of the pieces in the layer formatted_data <- formatted_data %>% - group_by(.data[[paste0("ord_layer_", formatted_col_index - 1)]]) %>% + # group_by(.data[[paste0("row_label", formatted_col_index - 1)]]) %>% + group_by(row_label1) %>% do(add_data_order_nested(., formatted_col_index - 1, numeric_data, indentation_length = indentation_length, ordering_cols = ordering_cols, @@ -222,7 +230,9 @@ add_order_columns.count_layer <- function(x) { break_ties = break_ties, numeric_cutoff = numeric_cutoff, numeric_cutoff_stat = numeric_cutoff_stat, - numeric_cutoff_column = numeric_cutoff_column)) %>% + numeric_cutoff_column = numeric_cutoff_column, + missing_subjects_row_label = missing_subjects_row_label, + missing_subjects_sort_value = missing_subjects_sort_value)) %>% ungroup() if (!is.null(nest_count) && nest_count) { @@ -352,7 +362,8 @@ add_order_columns.shift_layer <- function(x) { # The logic is the same now for a byvarn so reuse that function formatted_data[, paste0("ord_layer_", formatted_col_index)] <- get_data_order_byvarn(formatted_data, fact_df, as_name(target_var$row), - formatted_col_index, total_row_sort_value = total_row_sort_value) + formatted_col_index, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) rm(formatted_col_index) @@ -428,11 +439,16 @@ get_data_order <- function(x, formatted_col_index) { if(!is.null(missing_string)) missing_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_string) if(!is.null(total_row_label)) total_index <- which(unlist(formatted_data[, label_row_ind]) %in% total_row_label) + if(!is.null(missing_subjects_row_label)) { + missing_subjects_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_subjects_row_label) + } + # No processing is needed here just pass in the needed info get_data_order_bycount(numeric_data, 
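With the shift layer now routed through `complete_and_limit()`, `set_limit_data_by()` applies to shift layers the same way it does to counts. A sketch using the bundled `tplyr_adpe` data, whose columns are defined in `data-raw/adpe.R` later in this change:

```r
library(Tplyr)

# Limit the completed shift combinations to PARAM/AVISIT pairs that actually
# occur in the data, instead of the full cartesian expansion.
tplyr_table(tplyr_adpe, TRT01A) %>%
  add_layer(
    group_shift(vars(row = BASEC, column = AVALC),
                by = vars(PECAT, PARAM, AVISIT)) %>%
      set_limit_data_by(PARAM, AVISIT)
  ) %>%
  build()
```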
ordering_cols, treat_var, by, cols, result_order_var, target_var, missing_index, missing_sort_value, total_index, total_row_sort_value, + missing_subjects_index, missing_subjects_sort_value, break_ties = break_ties, numeric_cutoff = numeric_cutoff, numeric_cutoff_stat = numeric_cutoff_stat, @@ -466,7 +482,8 @@ get_data_order <- function(x, formatted_col_index) { } get_data_order_byvarn(formatted_data, varn_df, as_name(target_var[[1]]), - formatted_col_index, total_row_sort_value = total_row_sort_value) + formatted_col_index, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) # Here it is 'byfactor' @@ -521,7 +538,8 @@ get_data_order <- function(x, formatted_col_index) { # The logic is the same now for a byvarn so reuse that function get_data_order_byvarn(formatted_data, fact_df, as_name(target_var[[1]]), - formatted_col_index, total_row_sort_value = total_row_sort_value) + formatted_col_index, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) } }, envir = x) } @@ -532,6 +550,8 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, treat_var, by, cols, result_order_var, target_var, missing_index = NULL, missing_sort_value = NULL, total_index = NULL, total_row_sort_value = NULL, + missing_subjects_index = NULL, + missing_subjects_sort_value = NULL, break_ties, numeric_cutoff, numeric_cutoff_stat, numeric_cutoff_column, nested = FALSE) { @@ -606,6 +626,10 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, numeric_ordering_data[total_index,] <- total_row_sort_value } + if(!is.null(missing_subjects_index) && !is.null(missing_subjects_sort_value)) { + numeric_ordering_data[missing_subjects_index,] <- missing_subjects_sort_value + } + # This is the numeric index that the numeric data is in. radix was chosen because # its the only method that gives indicies as far as I can tell # x are the values @@ -632,7 +656,8 @@ get_data_order_bycount <- function(numeric_data, ordering_cols, } get_data_order_byvarn <- function(formatted_data, by_varn_df, by_var, by_column_index, - indentation = "", total_row_sort_value = NULL) { + indentation = "", total_row_sort_value = NULL, + missing_subjects_sort_value = NULL) { # Pull out the by values in the formatted data. by_values <- unlist(formatted_data[, by_column_index]) @@ -650,6 +675,8 @@ get_data_order_byvarn <- function(formatted_data, by_varn_df, by_var, by_column_ # Flag to determine where total row is positioned if(!is.null(total_row_sort_value)) { total_row_sort_value + } else if (!is.null(missing_subjects_sort_value)){ + missing_subjects_sort_value } else { max(by_varn_df[,2]) + 1 } @@ -686,7 +713,7 @@ get_data_order_byvarn <- function(formatted_data, by_varn_df, by_var, by_column_ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { # Pull out dots - list2env(list(...), envir = environment()) + list2env(list2(...), envir = environment()) # Here are the names of the formatted data row labels. We usually only work with the last row_label_vec <- vars_select(names(group_data), starts_with("row_label")) @@ -694,20 +721,19 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) 
{ ##### Outer nest values ##### # The value of the outer label outer_value <- group_data[1, tail(row_label_vec, 1)][[1]] + # Reserve for joins + mrg_by <- paste0("row_label", seq_along(by))[-1] if(order_count_method[1] == "byvarn") { varn_df <- get_varn_values(target, as_name(by[[1]])) all_outer$..index <- group_data[1,] %>% - get_data_order_byvarn(varn_df, by[[1]], final_col, total_row_sort_value = total_row_sort_value) + get_data_order_byvarn(varn_df, by[[1]], final_col, total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) - group_data[, paste0("ord_layer_", final_col)] <- all_outer %>% - filter(summary_var == outer_value) %>% - ungroup() %>% - select(..index) - } else if(order_count_method[1] == "bycount") { + } else if(order_count_method[1] == "bycount") { all_outer$..index <- all_outer %>% get_data_order_bycount(ordering_cols, treat_var, vars(!!!head(by, -1)), cols, result_order_var, vars(!!by[[1]], !!target_var), @@ -716,64 +742,103 @@ add_data_order_nested <- function(group_data, final_col, numeric_data, ...) { numeric_cutoff_stat = numeric_cutoff_stat, numeric_cutoff_column = numeric_cutoff_column, nested = TRUE) + } + - group_data[, paste0("ord_layer_", final_col)] <- all_outer %>% - filter(summary_var == outer_value) %>% - ungroup() %>% - select(..index) + # Grab the index created above and insert it into group data + if (order_count_method[1] %in% c("bycount", "byvarn")){ + if (length(mrg_by) == 0) { + group_data[,"ord_layer_1"] <- all_outer %>% + filter(summary_var == outer_value) %>% + ungroup() %>% + pull(..index) + } else { + group_data[,"ord_layer_1"] <- group_data %>% + left_join( + all_outer %>% + filter(summary_var == outer_value) %>% + replace_by_string_names(c(by, quo(summary_var))) %>% + select(starts_with('row'), ..index, -c(row_label1, !!treat_var)), + by = mrg_by + ) %>% + pull(..index) + } } - present_vars <- unlist(group_data[-1, row_label_vec[length(row_label_vec)]]) + outer_nest_rows <- group_data %>% + filter(!!sym(tail(row_label_vec, 1)) == outer_value) %>% + nrow() + + present_vars <- group_data[(outer_nest_rows + 1): nrow(group_data),][[row_label_vec[length(row_label_vec)]]] + ##### Inner nest values ##### filtered_numeric_data <- numeric_data %>% # Only include the parts of the numeric data that is in the current label - filter(numeric_data$summary_var %in% present_vars, !is.na(!!by[[1]])) %>% + filter(numeric_data$summary_var %in% present_vars, !!by[[1]] == outer_value) %>% # Remove nesting prefix to prepare numeric data. 
- mutate(summary_var := str_sub(summary_var, indentation_length)) - + mutate(summary_var := str_sub(summary_var, indentation_length+1)) #Same idea here, remove prefix - filtered_group_data <- group_data[-1, ] %>% + filtered_group_data <- tail(group_data, -outer_nest_rows) %>% mutate(!!row_label_vec[length(row_label_vec)] := str_sub(.data[[row_label_vec[length(row_label_vec)]]], indentation_length + 1)) + + # Identify the index of missing subjects + if(!is.null(missing_subjects_row_label)) { + missing_subjects_index <- which(filtered_group_data[[length(row_label_vec)]] %in% missing_subjects_row_label) + } + # The first row is always the first thing in the order so make it Inf - group_data[1, paste0("ord_layer_", final_col + 1)] <- ifelse((is.null(outer_inf) || outer_inf), Inf, -Inf) + group_data[1:outer_nest_rows, paste0("ord_layer_", final_col + 1)] <- ifelse((is.null(outer_inf) || outer_inf), Inf, -Inf) if(tail(order_count_method, 1) == "bycount") { if (nrow(group_data) > 1) { - group_data[-1 , paste0("ord_layer_", final_col + 1)] <- get_data_order_bycount(filtered_numeric_data, - ordering_cols, - treat_var, - head(by, -1), - cols, - result_order_var, - target_var, - break_ties = break_ties, - numeric_cutoff = numeric_cutoff, - numeric_cutoff_stat = numeric_cutoff_stat, - numeric_cutoff_column = numeric_cutoff_column, - nested = TRUE) - } - } else if(tail(order_count_method, 1) == "byvarn") { - - varn_df <- get_varn_values(target, target_var[[1]]) - - - - group_data[-1, paste0("ord_layer_", final_col + 1)] <- get_data_order_byvarn(filtered_group_data, - varn_df, - target_var[[1]], - length(by) + 1, - indentation, - total_row_sort_value = total_row_sort_value) - - } else { - - group_row_count <- nrow(group_data[-1,]) - # Logic for group_row_count is when numeric_where values cause unexpected results - group_data[-1, paste0("ord_layer_", final_col + 1)] <- 1:ifelse(group_row_count == 0, 1, group_row_count) + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1)] <- get_data_order_bycount(filtered_numeric_data, + ordering_cols, + treat_var, + head(by, -1), + cols, + result_order_var, + target_var, + break_ties = break_ties, + missing_subjects_index = missing_subjects_index, + missing_subjects_sort_value = missing_subjects_sort_value, + numeric_cutoff = numeric_cutoff, + numeric_cutoff_stat = numeric_cutoff_stat, + numeric_cutoff_column = numeric_cutoff_column, + nested = TRUE) + } + } else if(tail(order_count_method, 1) == "byvarn") { + + varn_df <- get_varn_values(target, target_var[[1]]) + + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1) + ] <- get_data_order_byvarn(filtered_group_data, + varn_df, + target_var[[1]], + length(by) + 1, + indentation, + total_row_sort_value = total_row_sort_value, + missing_subjects_sort_value = missing_subjects_sort_value) + } else { + group_row_count <- nrow(group_data[(outer_nest_rows + 1): nrow(group_data),]) + # Logic for group_row_count is when numeric_where values cause unexpected results + group_data[ + (outer_nest_rows + 1): nrow(group_data), + paste0("ord_layer_", final_col + 1) + ] <- 1:ifelse(group_row_count == 0, 1, group_row_count) + + # missing_subjects_row_label not passing in here + if (!is.null(missing_subjects_sort_value)) { + missing_rows <- group_data[[paste0('row_label', final_col+1)]] == paste0(indentation, missing_subjects_row_label) + group_data[missing_rows, paste0("ord_layer_", final_col + 1)] <- missing_subjects_sort_value + } } group_data diff 
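The reworked nested ordering above still surfaces through the familiar `ord_layer_*` columns of the built table, so nothing changes in how they are consumed after `build()`. A quick sketch for inspecting them on a nested AE count; the treatment and AE column names are assumptions based on the bundled ADAE subset:

```r
library(dplyr)
library(Tplyr)

dat <- tplyr_table(tplyr_adae, TRTA) %>%
  add_layer(
    group_count(vars(AEBODSYS, AEDECOD))
  ) %>%
  build()

# ord_layer_1 orders the outer body systems and ord_layer_2 the nested
# preferred terms (outer rows default to Inf); pass these to arrange() in
# whatever scheme the output requires.
dat %>%
  select(starts_with("row_label"), starts_with("ord_layer_"))
```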
--git a/R/table.R b/R/table.R index a2441b3a..f9c79f6c 100644 --- a/R/table.R +++ b/R/table.R @@ -41,16 +41,7 @@ #' tab <- tplyr_table(iris, Species, where = Sepal.Length < 5.8) #' tplyr_table <- function(target, treat_var, where = TRUE, cols = vars()) { - - if(missing(target)){ - # return a blank environment if no table information is passed. This can be - # used as a placeholder when creating a table if the dataset is not available. - return(structure(rlang::env(), - class = c("tplyr_table", "environment"))) - } - target_name <- enexpr(target) - new_tplyr_table(target, enquo(treat_var), enquo(where), enquos(cols), target_name) } diff --git a/R/table_bindings.R b/R/table_bindings.R index 96bc2ea6..91f817fb 100644 --- a/R/table_bindings.R +++ b/R/table_bindings.R @@ -89,6 +89,8 @@ set_header_n <- function(table, value) { #' #' pop_data(tab) <- mtcars #' +#' tab <- tplyr_table(iris, Species) %>% +#' set_pop_data(mtcars) #' @export #' @rdname pop_data pop_data <- function(table) { @@ -108,6 +110,7 @@ pop_data <- function(table) { #' #' @export #' @rdname pop_data +#' set_pop_data <- function(table, pop_data) { pop_data_name <- enexpr(pop_data) # table should be a data.frame @@ -319,7 +322,7 @@ get_desc_layer_formats <- function(obj) { #' @rdname table_format_defaults set_desc_layer_formats <- function(obj, ...) { # Bind the formats into the table - env_bind(obj, desc_layer_formats = list(...)) + env_bind(obj, desc_layer_formats = list2(...)) obj } @@ -337,8 +340,8 @@ get_count_layer_formats <- function(obj) { set_count_layer_formats <- function(obj, ...) { # Bind the formats into the table - if (length(list(...)) > 0) params <- count_f_str_check(...) - else params <- list(...) + if (length(list2(...)) > 0) params <- count_f_str_check(...) + else params <- list2(...) env_bind(obj, count_layer_formats = params) obj @@ -355,6 +358,6 @@ get_shift_layer_formats <- function(obj) { #' @rdname table_format_defaults set_shift_layer_formats <- function(obj, ...) { # Bind the formats into the table - env_bind(obj, shift_layer_formats = list(...)) + env_bind(obj, shift_layer_formats = list2(...)) obj } diff --git a/R/utils.R b/R/utils.R index 1187d100..fa47cc17 100644 --- a/R/utils.R +++ b/R/utils.R @@ -12,7 +12,7 @@ modify_nested_call <- function(c, examine_only=FALSE, ...) { # Get exports from Tplyr - allowable_calls = getNamespaceExports("Tplyr") + allowable_calls <- getNamespaceExports("Tplyr") # Only allow the user to use `Tplyr` functions assert_that( @@ -154,22 +154,6 @@ replace_by_string_names <- function(dat, by, treat_var = NULL) { mutate_at(row_labels, ~ as.character(.x)) # Coerce all row labels into character } -#' Get the unique levels/factors of a dataset -#' -#' @param e An environment, generally a table or a layer object -#' @param x A target variable to get the levels/unique values of -#' -#' @return Unique target values -#' @noRd -get_target_levels <- function(e, x) { - # If its a factor just return the levels - if(is.factor(env_get(e, "target", inherit = TRUE)[, as_name(x)])) levels(env_get(e, "built_target", inherit = TRUE)[, as_name(x)]) - # Otherwise return the unique values - else { - unique(env_get(e, "built_target", inherit = TRUE)[, as_name(x)]) - } -} - #' Replace repeating row label variables with blanks in preparation for display. 
#' #' Depending on the display package being used, row label values may need to be @@ -267,22 +251,6 @@ extract_character_from_quo <- function(var_list) { var_list[!is_symbol_] } -#' Get maximum string format recursivly -#' -#' @param lay A layer object -#' -#' @return Maximum length of sub layers -#' @noRd -get_max_length <- function(lay) { - # Initalize max_ to -1 - max_ <- -1L - # Get maximum length of all sub layers - if(length(lay$layers) > 0) max_ <- max(map_int(lay$layers, get_max_length)) - - # return greatest between sub layers and current layer - max(max_, lay$format_strings$size) -} - #' Clean variable attributes #' #' @param dat Dataframe to strip of variable attributes @@ -314,8 +282,8 @@ ut_round <- function(x, n=0) { # x is the value to be rounded # n is the precision of the rounding - posneg <- sign(x) - e <- abs(x) * 10^n + posneg <- sign(x) + e <- abs(x) * 10^n e <- e + 0.5 + sqrt(.Machine$double.eps) e <- trunc(e) e <- e / 10^n diff --git a/R/zzz.R b/R/zzz.R index 53944b32..5b1989d4 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,17 +1,17 @@ #' @importFrom rlang env enquo enquos caller_env abort inform is_quosure quo_get_expr quo_is_null env_get env_bind env_has quo_is_missing quos enexprs -#' @importFrom rlang call_modify call_standardise call_name call_args is_call current_env quo_name trace_back is_function +#' @importFrom rlang call_modify call_name call_args is_call current_env quo_name trace_back is_function list2 #' @importFrom rlang expr exprs enexprs enexpr is_named env_parent env_label is_logical is_empty is_quosures quo_is_symbol sym syms := as_name #' @importFrom rlang quos quo env_names env_bind_active as_label eval_tidy warn quo_is_call -#' @importFrom stringr str_split str_extract_all regex str_detect str_replace_all str_replace str_locate_all fixed str_count str_trim str_wrap -#' @importFrom purrr flatten map map_lgl pmap_chr imap reduce map_chr map_int map_dbl map_dfr pmap_dfr walk2 map2 map2_dfr walk -#' @importFrom stringr str_sub str_sub<- str_extract str_pad str_starts str_remove_all str_match_all +#' @importFrom stringr str_split str_extract_all regex str_detect str_replace_all str_replace str_locate_all fixed str_count str_trim str_wrap str_count +#' @importFrom purrr flatten map map_lgl pmap_chr imap reduce map_chr map_int map_dbl map_dfr pmap_dfr walk2 map2 map2_dfr map2_chr walk +#' @importFrom stringr str_sub str_sub<- str_extract str_pad str_starts str_remove_all str_match_all str_match #' @importFrom tidyr pivot_longer pivot_wider replace_na #' @importFrom magrittr %>% extract extract2 #' @importFrom assertthat assert_that #' @importFrom stats IQR median sd quantile var #' @importFrom dplyr n summarize filter vars tally ungroup group_by mutate lag select bind_rows full_join add_tally distinct rowwise #' @importFrom dplyr everything rename mutate_at mutate_all as_tibble bind_cols do case_when arrange left_join row_number between mutate_if -#' @importFrom dplyr across anti_join n_distinct if_else group_keys cur_group cur_column pull matches +#' @importFrom dplyr across anti_join n_distinct if_else group_keys cur_group cur_column pull matches slice_head where desc count #' @importFrom tidyr complete nesting pivot_wider pivot_longer replace_na starts_with fill #' @importFrom utils str head tail #' @importFrom tidyselect all_of vars_select any_of @@ -199,7 +199,6 @@ display_string <- NULL built_target <- NULL table_where <- NULL distinct_by <- NULL -distinct_stat <- NULL summary_vars <- NULL trans_vars <- NULL stat <- NULL @@ -277,7 +276,6 @@ 
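For context on the rounding helper re-indented above: it implements "round half away from zero" (SAS-style) rather than base R's round-half-to-even. A standalone sketch of the approach under a hypothetical name; the final sign restoration is assumed from the released function, since the hunk above only shows the re-indented lines:

```r
# Round half away from zero, mirroring the ut_round() helper touched above
round_half_up <- function(x, n = 0) {
  posneg <- sign(x)                          # remember the sign
  e <- abs(x) * 10^n                         # shift the target digit left of the decimal
  e <- e + 0.5 + sqrt(.Machine$double.eps)   # nudge past floating point representation error
  e <- trunc(e)                              # drop the remaining fraction
  e <- e / 10^n                              # shift back
  e * posneg                                 # restore the sign
}

round(2.5)           # 2  -- base R rounds halves to even
round_half_up(2.5)   # 3
round_half_up(-2.5)  # -3
```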
missing_sort_value <- NULL missing_index <- NULL total_index <- NULL process_distinct_total <- FALSE -total_stat_denom <- NULL denom_where <- NULL built_target_pre_where <- NULL count_fmt <- NULL @@ -303,3 +301,18 @@ l <- NULL w <- NULL s <- NULL out <- NULL +og_row <- NULL +desc <- NULL +id <- NULL +stub_sort <- NULL +include_missing_subjects_row <- NULL +missing_subjects_row_label <- NULL +missing_subjects_stat <- NULL +missing_subjects_count_format <- NULL +missing_subject_rows <- NULL +missing_subject_vars_ord <- NULL +string_ms <- NULL +missing_subjects_sort_value <- NULL +limit_data_by <- NULL +n_present <- NULL +header_tots <- NULL diff --git a/README.Rmd b/README.Rmd index 5f1c058c..e7d0eee2 100644 --- a/README.Rmd +++ b/README.Rmd @@ -17,11 +17,9 @@ library(tidyverse) library(magrittr) library(Tplyr) library(knitr) -load("vignettes/adae.Rdata") -load("vignettes/adsl.Rdata") ``` -# *Tplyr* +# **Tplyr** [](https://pharmaverse.org) @@ -50,7 +48,7 @@ install.packages("Tplyr") devtools::install_github("https://github.com/atorus-research/Tplyr.git", ref="devel") ``` -# What is *Tplyr*? +# What is **Tplyr**? [dplyr](https://dplyr.tidyverse.org/) from tidyverse is a grammar of data manipulation. So what does that allow you to do? It gives you, as a data analyst, the capability to easily and intuitively approach the problem of manipulating your data into an analysis ready form. [dplyr](https://dplyr.tidyverse.org/) conceptually breaks things down into verbs that allow you to focus on _what_ you want to do more than _how_ you have to do it. @@ -76,11 +74,10 @@ When you look at this table, you can begin breaking this output down into smalle So we have one table, with 6 summaries (7 including the next page, not shown) - but only 2 different approaches to summaries being performed. In the same way that [dplyr](https://dplyr.tidyverse.org/) is a grammar of data manipulation, **Tplyr** aims to be a grammar of data summary. The goal of **Tplyr** is to allow you to program a summary table like you see it on the page, by breaking a larger problem into smaller 'layers', and combining them together like you see on the page. -Enough talking - let's see some code. In these examples, we will be using data from the [PHUSE Test Data Factory]( https://advance.phuse.global/display/WEL/Test+Dataset+Factory) based on the [original pilot project submission package](https://github.com/atorus-research/CDISC_pilot_replication). Note: You can see our replication of the CDISC pilot using the PHUSE Test Data Factory data [here](https://github.com/atorus-research/CDISC_pilot_replication). +Enough talking - let's see some code. In these examples, we will be using data from the [PHUSE Test Data Factory]( https://advance.phuse.global/display/WEL/Test+Dataset+Factory) based on the [original pilot project submission package](https://github.com/atorus-research/CDISC_pilot_replication). We've packaged some subsets of that data into **Tplyr**, which you can use to replicate our examples and run our vignette code yourself. Note: You can see our replication of the CDISC pilot using the PHUSE Test Data Factory data [here](https://github.com/atorus-research/CDISC_pilot_replication). 
```{r initial_demo} - -tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>% +tplyr_table(tplyr_adsl, TRT01P, where = SAFFL == "Y") %>% add_layer( group_desc(AGE, by = "Age (years)") ) %>% @@ -89,10 +86,9 @@ tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>% ) %>% build() %>% kable() - ``` -## *Tplyr* is Qualified +## **Tplyr** is Qualified We understand how important documentation and testing is within the pharmaceutical world. This is why outside of unit testing **Tplyr** includes an entire user-acceptance testing document, where requirements were established, test-cases were written, and tests were independently programmed and executed. We do this in the hope that you can leverage our work within a qualified programming environment, and that we save you a substantial amount of trouble in getting it there. diff --git a/README.md b/README.md index 98d0c81a..31fcf75a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -# Tplyr +# **Tplyr** @@ -42,7 +42,7 @@ install.packages("Tplyr") devtools::install_github("https://github.com/atorus-research/Tplyr.git", ref="devel") ``` -# What is Tplyr? +# What is **Tplyr**? [dplyr](https://dplyr.tidyverse.org/) from tidyverse is a grammar of data manipulation. So what does that allow you to do? It gives you, as a @@ -58,10 +58,10 @@ pharmaceutical industry, a great deal of the data presented in the outputs we create are very similar. For the most part, most of these tables can be broken down into a few categories: -- Counting for event based variables or categories -- Shifting, which is just counting a change in state with a ‘from’ and - a ‘to’ -- Generating descriptive statistics around some continuous variable. +- Counting for event based variables or categories +- Shifting, which is just counting a change in state with a ‘from’ and a + ‘to’ +- Generating descriptive statistics around some continuous variable. For many of the tables that go into a clinical submission, the tables are made up of a combination of these approaches. Consider a @@ -81,15 +81,15 @@ into smaller, redundant, components. These components can be viewed as layers. The boxes in the image above represent how you can begin to conceptualize this. -- First we have Sex, which is made up of n (%) counts. -- Next we have Age as a continuous variable, where we have a number of - descriptive statistics, including n, mean, standard deviation, - median, quartile 1, quartile 3, min, max, and missing values. -- After that we have age, but broken into categories - so this is once - again n (%) values. -- Race - more counting, -- Ethnicity - more counting -- Weight - and we’re back to descriptive statistics. +- First we have Sex, which is made up of n (%) counts. +- Next we have Age as a continuous variable, where we have a number of + descriptive statistics, including n, mean, standard deviation, median, + quartile 1, quartile 3, min, max, and missing values. +- After that we have age, but broken into categories - so this is once + again n (%) values. +- Race - more counting, +- Ethnicity - more counting +- Weight - and we’re back to descriptive statistics. So we have one table, with 6 summaries (7 including the next page, not shown) - but only 2 different approaches to summaries being performed. @@ -104,13 +104,14 @@ using data from the [PHUSE Test Data Factory](https://advance.phuse.global/display/WEL/Test+Dataset+Factory) based on the [original pilot project submission package](https://github.com/atorus-research/CDISC_pilot_replication). 
-Note: You can see our replication of the CDISC pilot using the PHUSE -Test Data Factory data +We’ve packaged some subsets of that data into **Tplyr**, which you can +use to replicate our examples and run our vignette code yourself. Note: +You can see our replication of the CDISC pilot using the PHUSE Test Data +Factory data [here](https://github.com/atorus-research/CDISC_pilot_replication). ``` r - -tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>% +tplyr_table(tplyr_adsl, TRT01P, where = SAFFL == "Y") %>% add_layer( group_desc(AGE, by = "Age (years)") ) %>% @@ -133,7 +134,7 @@ tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>% | Age Categories n (%) | \>80 | 30 ( 34.9%) | 18 ( 21.4%) | 29 ( 34.5%) | 2 | 1 | 2 | | Age Categories n (%) | 65-80 | 42 ( 48.8%) | 55 ( 65.5%) | 47 ( 56.0%) | 2 | 1 | 3 | -## Tplyr is Qualified +## **Tplyr** is Qualified We understand how important documentation and testing is within the pharmaceutical world. This is why outside of unit testing **Tplyr** @@ -153,38 +154,38 @@ this report. Here are some of the high level benefits of using **Tplyr**: -- Easy construction of table data using an intuitive syntax -- Smart string formatting for your numbers that’s easily specified by - the user -- A great deal of flexibility in what is performed and how it’s - presented, without specifying hundreds of parameters +- Easy construction of table data using an intuitive syntax +- Smart string formatting for your numbers that’s easily specified by + the user +- A great deal of flexibility in what is performed and how it’s + presented, without specifying hundreds of parameters # Where to go from here? There’s quite a bit more to learn! And we’ve prepared a number of other vignettes to help you get what you need out of **Tplyr**. -- The best place to start is with our Getting Started vignette at - `vignette("Tplyr")` -- Learn more about table level settings in `vignette("table")` -- Learn more about descriptive statistics layers in `vignette("desc")` -- Learn more about count layers in `vignette("count")` -- Learn more about shift layers in `vignette("shift")` -- Learn more about percentages in `vignette("denom")` -- Learn more about calculating risk differences in - `vignette("riskdiff")` -- Learn more about sorting **Tplyr** tables in `vignette("sort")` -- Learn more about using **Tplyr** options in `vignette("options")` -- And finally, learn more about producing and outputting styled tables - using **Tplyr** in `vignette("styled-table")` +- The best place to start is with our Getting Started vignette at + `vignette("Tplyr")` +- Learn more about table level settings in `vignette("table")` +- Learn more about descriptive statistics layers in `vignette("desc")` +- Learn more about count layers in `vignette("count")` +- Learn more about shift layers in `vignette("shift")` +- Learn more about percentages in `vignette("denom")` +- Learn more about calculating risk differences in + `vignette("riskdiff")` +- Learn more about sorting **Tplyr** tables in `vignette("sort")` +- Learn more about using **Tplyr** options in `vignette("options")` +- And finally, learn more about producing and outputting styled tables + using **Tplyr** in `vignette("styled-table")` In the **Tplyr** version 1.0.0, we’ve packed a number of new features in. 
For deeper dives on the largest new additions: -- Learn about **Tplyr’s** traceability metadata in - `vignette("metadata")` and about how it can be extended in - `vignette("custom-metadata")` -- Learn about layer templates in `vignette("layer_templates")` +- Learn about **Tplyr**’s traceability metadata in + `vignette("metadata")` and about how it can be extended in + `vignette("custom-metadata")` +- Learn about layer templates in `vignette("layer_templates")` # References diff --git a/_pkgdown.yml b/_pkgdown.yml index b638c118..380b4141 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -62,7 +62,9 @@ reference: - add_total_row - add_total_group - add_risk_diff + - add_missing_subjects_row - set_total_row_label + - set_missing_subjects_row_label - title: Descriptive Statistics Layer Functions desc: Descriptive statistics layer helper functions - contents: @@ -84,6 +86,7 @@ reference: - set_denom_ignore - set_indentation - set_numeric_threshold + - set_limit_data_by - title: Column Headers desc: Column header helpers - contents: @@ -95,6 +98,7 @@ reference: - tplyr_meta - add_variables - add_filters + - add_anti_join - get_metadata - append_metadata - starts_with('get_meta') @@ -105,12 +109,14 @@ reference: - title: Post-pocessing desc: Post-pocessing functions - contents: - - str_indent_wrap - - apply_row_masks - apply_conditional_format + - apply_formats + - apply_row_masks + - collapse_row_labels + - replace_leading_whitespace - str_extract_fmt_group - str_extract_num - - apply_formats + - str_indent_wrap - title: Helper functions desc: General helper functions - contents: @@ -122,6 +128,15 @@ reference: - get_where.tplyr_layer - Tplyr - get_tplyr_regex +- title: Data + desc: Tplyr Built-in Datasets +- contents: + - tplyr_adae + - tplyr_adas + - tplyr_adlb + - tplyr_adsl + - tplyr_adpe + - get_data_labels articles: - title: Table Basics diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index d0a7458e..00000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,44 +0,0 @@ -# Starter pipeline -# Start with a minimal pipeline that you can customize to build and deploy your code. 
-# Add steps that build, run tests, deploy, and more: -# https://aka.ms/yaml - -# parameters: -# - name: tidyverse_version -# displayName: Tidyverse Version -# type: string -# default: 'rocker/tidyverse:latest' -# values: -# - 'rocker/tidyverse:latest' -# - rocker/tidyverse:3.6.3 -# - rocker/tidyverse:3.6.2 -# - rocker/tidyverse:3.6.1 -# - rocker/tidyverse:3.6.0 -# - rocker/tidyverse:3.5.3 -# - rocker/tidyverse:3.5.2 -# - rocker/tidyverse:3.5.1 -# - rocker/tidyverse:3.5.0 -# - rocker/tidyverse:3.4.4 -# - rocker/tidyverse:3.4.3 -# - rocker/tidyverse:3.4.2 -# - rocker/tidyverse:3.4.1 -# - rocker/tidyverse:3.4.0 -# - rocker/tidyverse:3.3.3 -# - rocker/tidyverse:3.3.2 -# - rocker/tidyverse:3.3.1 - -trigger: none - -pool: - vmImage: 'ubuntu-latest' - -container: 'rocker/tidyverse:latest' - -steps: - -- script: sudo Rscript -e 'install.packages("huxtable"); devtools::check(cran = FALSE)' - displayName: 'Package Check' - continueOnError: true - -- script: Rscript -e 'sessionInfo()' - displayName: 'R Version' diff --git a/cran-comments.md b/cran-comments.md index 00720ad0..a35c0f51 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,9 +1,9 @@ -## Submission 1.0.2 -* Bug fix identified in Tplyr 1.0.1 +## Submission 1.2.1 +* Added metadata handling for features introduced in 1.2.0, added function `add_anti_join()` ## Test Environments -* Local Ubuntu 18.04.4 devtools::check +* Local Ubuntu 22.04.3 devtools::check * Latest Ubuntu CI with latest tidyverse * Github release action with windows, linux, and osx check * RHub Check diff --git a/data-raw/DATASET.R b/data-raw/DATASET.R new file mode 100644 index 00000000..6514a944 --- /dev/null +++ b/data-raw/DATASET.R @@ -0,0 +1,3 @@ +## code to prepare `DATASET` dataset goes here + +usethis::use_data(DATASET, overwrite = TRUE) diff --git a/data-raw/adae.R b/data-raw/adae.R new file mode 100644 index 00000000..5c187c73 --- /dev/null +++ b/data-raw/adae.R @@ -0,0 +1,6 @@ +# note: adae.Rdata was copied over from vignettes/adsl.Rdata +# this is a copy of the PHUSE Test Data Factory data, trimmed down for size + +load("data-raw/adae.Rdata") +tplyr_adae <- adae +usethis::use_data(tplyr_adae, overwrite = TRUE) diff --git a/vignettes/adae.Rdata b/data-raw/adae.Rdata similarity index 100% rename from vignettes/adae.Rdata rename to data-raw/adae.Rdata diff --git a/data-raw/adas.R b/data-raw/adas.R new file mode 100644 index 00000000..5222a90a --- /dev/null +++ b/data-raw/adas.R @@ -0,0 +1,6 @@ +# note: adlb.Rdata was copied over from vignettes/adsl.Rdata +# this is a copy of the PHUSE Test Data Factory data, trimmed down for size + +load("data-raw/adas.Rdata") +tplyr_adas <- adas +usethis::use_data(tplyr_adas, overwrite = TRUE) diff --git a/vignettes/adas.Rdata b/data-raw/adas.Rdata similarity index 100% rename from vignettes/adas.Rdata rename to data-raw/adas.Rdata diff --git a/data-raw/adlb.R b/data-raw/adlb.R new file mode 100644 index 00000000..df2972ea --- /dev/null +++ b/data-raw/adlb.R @@ -0,0 +1,6 @@ +# note: adlb.Rdata was copied over from vignettes/adsl.Rdata +# this is a copy of the PHUSE Test Data Factory data, trimmed down for size + +load("data-raw/adlb.Rdata") +tplyr_adlb <- adlb +usethis::use_data(tplyr_adlb, overwrite = TRUE) diff --git a/vignettes/adlb.Rdata b/data-raw/adlb.Rdata similarity index 100% rename from vignettes/adlb.Rdata rename to data-raw/adlb.Rdata diff --git a/data-raw/adpe.R b/data-raw/adpe.R new file mode 100644 index 00000000..d80860eb --- /dev/null +++ b/data-raw/adpe.R @@ -0,0 +1,31 @@ +# This adpe dataset is just a 
mock-up that's fit for purpose to test and demonstrate data limiting +tplyr_adpe <- tibble::tribble(~USUBJID, ~AVISIT, ~PECAT, ~PARAM, ~TRT01A, ~AVALC, ~AVAL, ~BASEC, + "101-001", "Screening", "A", "Head", "TRT A", "Normal", 1, "Abnormal", + "101-001", "Screening", "A", "Lungs", "TRT A", "Normal", 2, "Semi-Normal", + "101-001", "Day -1", "A", "Lungs", "TRT A", "Normal", 3, "Normal", + "101-001", "Day 5", "A", "Lungs", "TRT A", "Normal", 4, "Normal", + "101-002", "Screening", "A", "Head", "TRT B", "Semi-Normal", 5, "Normal", + "101-002", "Screening", "A", "Lungs", "TRT B", "Normal", 6, "Normal", + "101-002", "Day -1", "A", "Lungs", "TRT B", "Normal", 7, "Normal", + "101-002", "Day 5", "A", "Lungs", "TRT B", "Normal", 8, "Semi-Normal", + "101-003", "Screening", "A", "Head", "TRT A", "Normal", 9, "Normal", + "101-003", "Screening", "A", "Lungs", "TRT A", "Abnormal", 10, "Abnormal", + "101-003", "Day -1", "A", "Lungs", "TRT A", "Normal", 11, "Abnormal", + "101-003", "Day 5", "A", "Lungs", "TRT A", "Abnormal", 12, "Abnormal", + + "101-001", "Screening", "B", "Lungs", "TRT A", "Normal", 13, "Normal", + "101-001", "Day -1", "B", "Lungs", "TRT A", "Normal", 14, "Normal", + "101-001", "Day 5", "B", "Lungs", "TRT A", "Normal", 15, "Semi-Normal", + "101-002", "Screening", "B", "Lungs", "TRT B", "Normal", 16, "Normal", + "101-002", "Day -1", "B", "Lungs", "TRT B", "Normal", 17, "Abnormal", + "101-002", "Day 5", "B", "Lungs", "TRT B", "Normal", 18, "Abnormal", + "101-003", "Screening", "B", "Lungs", "TRT A", "Abnormal", 19, "Normal", + "101-003", "Day -1", "B", "Lungs", "TRT A", "Normal", 20, "Normal", + "101-003", "Day 5", "B", "Lungs", "TRT A", "Abnormal", 21, "Normal" +) + +tplyr_adpe$AVALC <- factor(tplyr_adpe$AVALC, levels = c("Normal", "Semi-Normal", "Abnormal")) +tplyr_adpe$BASEC <- factor(tplyr_adpe$BASEC, levels = c("Normal", "Semi-Normal", "Abnormal")) +tplyr_adpe$AVISIT <- factor(tplyr_adpe$AVISIT, levels = c("Screening", "Day -1", "Day 5")) + +usethis::use_data(tplyr_adpe, overwrite = TRUE) diff --git a/data-raw/adsl.R b/data-raw/adsl.R new file mode 100644 index 00000000..a56cf66f --- /dev/null +++ b/data-raw/adsl.R @@ -0,0 +1,6 @@ +# note: adsl.Rdata was copied over from vignettes/adsl.Rdata +# this is a copy of the PHUSE Test Data Factory data, trimmed down for size + +load("data-raw/adsl.Rdata") +tplyr_adsl <- adsl +usethis::use_data(tplyr_adsl, overwrite = TRUE) diff --git a/vignettes/adsl.Rdata b/data-raw/adsl.Rdata similarity index 100% rename from vignettes/adsl.Rdata rename to data-raw/adsl.Rdata diff --git a/data/tplyr_adae.rda b/data/tplyr_adae.rda new file mode 100644 index 00000000..5d6c868a Binary files /dev/null and b/data/tplyr_adae.rda differ diff --git a/data/tplyr_adas.rda b/data/tplyr_adas.rda new file mode 100644 index 00000000..789d372a Binary files /dev/null and b/data/tplyr_adas.rda differ diff --git a/data/tplyr_adlb.rda b/data/tplyr_adlb.rda new file mode 100644 index 00000000..d9e628c9 Binary files /dev/null and b/data/tplyr_adlb.rda differ diff --git a/data/tplyr_adpe.rda b/data/tplyr_adpe.rda new file mode 100644 index 00000000..58557959 Binary files /dev/null and b/data/tplyr_adpe.rda differ diff --git a/data/tplyr_adsl.rda b/data/tplyr_adsl.rda new file mode 100644 index 00000000..ab16e055 Binary files /dev/null and b/data/tplyr_adsl.rda differ diff --git a/man/Tplyr.Rd b/man/Tplyr.Rd index 1a923391..06706f33 100644 --- a/man/Tplyr.Rd +++ b/man/Tplyr.Rd @@ -125,6 +125,9 @@ Other contributors: \itemize{ \item Nathan Kosiba 
\email{Nathan.Kosiba@atorusresearch.com} (\href{https://orcid.org/0000-0001-5359-4234}{ORCID}) [contributor] \item Sadchla Mascary \email{sadchla.mascary@atorusresearch.com} [contributor] + \item Andrew Bates \email{andrew.bates@atorusresearch.com} [contributor] + \item Shiyu Chen \email{shiyu.chen@atorusresearch.com} [contributor] + \item Oleksii Mikryukov \email{alex.mikryukov@atorusresearch.com} [contributor] \item Atorus Research LLC [copyright holder] } diff --git a/man/add_anti_join.Rd b/man/add_anti_join.Rd new file mode 100644 index 00000000..5d0bc5ca --- /dev/null +++ b/man/add_anti_join.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/meta.R +\name{add_anti_join} +\alias{add_anti_join} +\title{Add an anti-join onto a tplyr_meta object} +\usage{ +add_anti_join(meta, join_meta, on) +} +\arguments{ +\item{meta}{A tplyr_meta object referring to the target data} + +\item{join_meta}{A tplyr_meta object referring to the population data} + +\item{on}{A list of quosures containing symbols - most likely set to USUBJID.} +} +\value{ +A tplyr_meta object +} +\description{ +An anti-join allows a tplyr_meta object to refer to data that should be +extracted from a separate dataset, like the population data of a Tplyr table, +that is unavailable in the target dataset. The primary use case for this is +the presentation of missing subjects, which in a Tplyr table is presented +using the function \code{add_missing_subjects_row()}. The missing subjects +themselves are not present in the target data, and are thus only available in +the population data. The \code{add_anti_join()} function allows you to provide the +meta information relevant to the population data, and then specify the \code{on} +variable that should be used to join with the target dataset and find the +values present in the population data that are missing from the target data. +} +\examples{ + +tm <- tplyr_meta( + rlang::quos(TRT01A, SEX, ETHNIC, RACE), + rlang::quos(TRT01A == "Placebo", TRT01A == "SEX", ETHNIC == "HISPANIC OR LATINO") +) + +tm \%>\% + add_anti_join( + tplyr_meta( + rlang::quos(TRT01A, ETHNIC), + rlang::quos(TRT01A == "Placebo", ETHNIC == "HISPANIC OR LATINO") + ), + on = rlang::quos(USUBJID) + ) +} diff --git a/man/add_missing_subjects_row.Rd b/man/add_missing_subjects_row.Rd new file mode 100644 index 00000000..55769f5a --- /dev/null +++ b/man/add_missing_subjects_row.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/count_bindings.R +\name{add_missing_subjects_row} +\alias{add_missing_subjects_row} +\title{Add a missing subject row into a count summary.} +\usage{ +add_missing_subjects_row(e, fmt = NULL, sort_value = NULL) +} +\arguments{ +\item{e}{A `count_layer` object} + +\item{fmt}{An f_str object used to format the total row. If none is provided, +display is based on the layer formatting.} + +\item{sort_value}{The value that will appear in the ordering column for total +rows. This must be a numeric value.} +} +\description{ +This function calculates the number of subjects missing from a particular +group of results. The calculation is done by examining the total number of +subjects potentially available from the Header N values within the result +column, and finding the difference with the total number of subjects present +in the result group. Note that for accurate results, the subject variable +needs to be defined using the `set_distinct_by()` function. 
As with other +methods, this function instructs how distinct results should be identified. +} +\examples{ + +tplyr_table(mtcars, gear) \%>\% + add_layer( + group_count(cyl) \%>\% + add_missing_subjects_row(f_str("xxxx", n)) + ) \%>\% + build() +} diff --git a/man/add_total_row.Rd b/man/add_total_row.Rd index c9ca5f97..102423db 100644 --- a/man/add_total_row.Rd +++ b/man/add_total_row.Rd @@ -33,6 +33,10 @@ total and the application of denominators becomes ambiguous. You will be warned specifically if a percent is included in the format. To rectify this, use \code{set_denoms_by()}, and the grouping of \code{add_total_row()} will be updated accordingly. + +Note that when using \code{add_total_row()} with \code{set_pop_data()}, you +should call \code{add_total_row()} AFTER calling \code{set_pop_data()}, +otherwise there is potential for unexpected behaivior with treatment groups. } \examples{ # Load in Pipe diff --git a/man/collapse_row_labels.Rd b/man/collapse_row_labels.Rd new file mode 100644 index 00000000..fd144a59 --- /dev/null +++ b/man/collapse_row_labels.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/collapse_row_labels.R +\name{collapse_row_labels} +\alias{collapse_row_labels} +\title{Collapse row labels into a single column} +\usage{ +collapse_row_labels(x, ..., indent = " ", target_col = row_label) +} +\arguments{ +\item{x}{Input data frame} + +\item{...}{Row labels to be collapsed} + +\item{indent}{Indentation string to be used, which is multiplied at each indentation level} + +\item{target_col}{The desired name of the output column containing collapsed row labels} +} +\value{ +data.frame with row labels collapsed into a single column +} +\description{ +This is a generalized post processing function that allows you to take groups +of by variables and collapse them into a single column. Repeating values are +split into separate rows, and for each level of nesting, a specified +indentation level can be applied. +} +\examples{ +x <- tibble::tribble( +~row_label1, ~row_label2, ~row_label3, ~row_label4, ~var1, + "A", "C", "G", "M", 1L, + "A", "C", "G", "N", 2L, + "A", "C", "H", "O", 3L, + "A", "D", "H", "P", 4L, + "A", "D", "I", "Q", 5L, + "A", "D", "I", "R", 6L, + "B", "E", "J", "S", 7L, + "B", "E", "J", "T", 8L, + "B", "E", "K", "U", 9L, + "B", "F", "K", "V", 10L, + "B", "F", "L", "W", 11L +) + + +collapse_row_labels(x, row_label1, row_label2, row_label3, row_label4) + +collapse_row_labels(x, row_label1, row_label2, row_label3) + +collapse_row_labels(x, row_label1, row_label2, indent = " ", target_col = rl) + +} diff --git a/man/get_data_labels.Rd b/man/get_data_labels.Rd new file mode 100644 index 00000000..56ba91c6 --- /dev/null +++ b/man/get_data_labels.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\name{get_data_labels} +\alias{get_data_labels} +\title{Get Data Labels} +\usage{ +get_data_labels(data) +} +\arguments{ +\item{data}{A Tplyr data set.} +} +\value{ +A data.frame with columns `name` and `label` containing the names and labels of each column. +} +\description{ +Get labels for data sets included in Tplyr. +} diff --git a/man/get_meta_subset.Rd b/man/get_meta_subset.Rd index f8028394..725892e0 100644 --- a/man/get_meta_subset.Rd +++ b/man/get_meta_subset.Rd @@ -14,6 +14,7 @@ get_meta_subset(x, row_id, column, add_cols = vars(USUBJID), ...) column, add_cols = vars(USUBJID), target = NULL, + pop_data = NULL, ... 
) @@ -32,6 +33,9 @@ string} \item{...}{additional arguments} \item{target}{A data frame to be subset (if not pulled from a Tplyr table)} + +\item{pop_data}{A data frame to be subset through an anti-join (if not pulled +from a Tplyr table)} } \value{ A data.frame diff --git a/man/layer_templates.Rd b/man/layer_templates.Rd index 329003dc..ff17203e 100644 --- a/man/layer_templates.Rd +++ b/man/layer_templates.Rd @@ -30,7 +30,7 @@ target, by, and where parameters.} \item{add_params}{Additional parameters passed into layer modifier functions. These arguments are specified in a template within curly brackets such as -{param}. Supply as a named list, where the element name is the parameter.} +\{param\}. Supply as a named list, where the element name is the parameter.} } \description{ There are several scenarios where a layer template may be useful. Some diff --git a/man/pop_data.Rd b/man/pop_data.Rd index 349a2d73..305553f3 100644 --- a/man/pop_data.Rd +++ b/man/pop_data.Rd @@ -39,4 +39,6 @@ tab <- tplyr_table(iris, Species) pop_data(tab) <- mtcars +tab <- tplyr_table(iris, Species) \%>\% + set_pop_data(mtcars) } diff --git a/man/replace_leading_whitespace.Rd b/man/replace_leading_whitespace.Rd new file mode 100644 index 00000000..cde36c59 --- /dev/null +++ b/man/replace_leading_whitespace.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/replace_leading_whitespace.R +\name{replace_leading_whitespace} +\alias{replace_leading_whitespace} +\title{Reformat strings with leading whitespace for HTML} +\usage{ +replace_leading_whitespace(x, tab_width = 4) +} +\arguments{ +\item{x}{Target string} + +\item{tab_width}{Number of spaces to compensate for tabs} +} +\value{ +String with   replaced for leading whitespace +} +\description{ +Reformat strings with leading whitespace for HTML +} +\examples{ +x <- c(" Hello there", " Goodbye Friend ", "\tNice to meet you", +" \t What are you up to? \t \t ") +replace_leading_whitespace(x) + +replace_leading_whitespace(x, tab=2) + +} diff --git a/man/set_limit_data_by.Rd b/man/set_limit_data_by.Rd new file mode 100644 index 00000000..c9416a76 --- /dev/null +++ b/man/set_limit_data_by.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/set_limit_data_by.R +\name{set_limit_data_by} +\alias{set_limit_data_by} +\title{Set variables to limit reported data values only to those that exist rather +than fully completing all possible levels} +\usage{ +set_limit_data_by(e, ...) +} +\arguments{ +\item{e}{A tplyr_layer} + +\item{...}{Subset of variables within by or target variables} +} +\value{ +a tplyr_table +} +\description{ +This function allows you to select a combination of by variables or +potentially target variables for which you only want to display values +present in the data. By default, Tplyr will create a cartesian combination of +potential values of the data. For example, if you have 2 by variables +present, then each potential combination of those by variables will have a +row present in the final table. \code{set_limit_data_by()} allows you to choose +the by variables whose combination you wish to limit to values physically +present in the available data. 
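The two post-processing helpers documented above are meant to chain after `build()`: collapse the row label columns into one stub column, then swap the leading indentation for `&nbsp;` so HTML renderers preserve it. A sketch using the bundled ADSL subset; the RACE/AGE columns and the final HTML rendering step are assumptions, not part of this change:

```r
library(dplyr)
library(Tplyr)

tplyr_table(tplyr_adsl, TRT01P) %>%
  add_layer(group_count(RACE, by = "Race n (%)")) %>%
  add_layer(group_desc(AGE, by = "Age (years)")) %>%
  build() %>%
  # Collapse row_label1/row_label2 into a single indented column
  collapse_row_labels(row_label1, row_label2) %>%
  # Convert the leading spaces to &nbsp; for HTML display
  mutate(row_label = replace_leading_whitespace(row_label)) %>%
  select(row_label, starts_with("var1_"))
# Pass the result to an HTML table package with escaping disabled so the
# &nbsp; entities render as indentation.
```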
+} +\examples{ + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) + ) \%>\% + build() + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) \%>\% + set_limit_data_by(PARAM, AVISIT) + ) \%>\% + build() + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) \%>\% + set_limit_data_by(PARAM, AVISIT) + ) \%>\% + build() + +tplyr_table(tplyr_adpe, TRT01A) \%>\% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) \%>\% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) \%>\% + build() +} diff --git a/man/set_missing_subjects_row_label.Rd b/man/set_missing_subjects_row_label.Rd new file mode 100644 index 00000000..bcc943a1 --- /dev/null +++ b/man/set_missing_subjects_row_label.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/count_bindings.R +\name{set_missing_subjects_row_label} +\alias{set_missing_subjects_row_label} +\title{Set the label for the missing subjects row} +\usage{ +set_missing_subjects_row_label(e, missing_subjects_row_label) +} +\arguments{ +\item{e}{A \code{count_layer} object} + +\item{missing_subjects_row_label}{A character to label the total row} +} +\value{ +The modified \code{count_layer} object +} +\description{ +Set the label for the missing subjects row +} +\examples{ + +t <- tplyr_table(mtcars, gear) \%>\% + add_layer( + group_count(cyl) \%>\% + add_missing_subjects_row() \%>\% + set_missing_subjects_row_label("Missing") + ) +build(t) +} diff --git a/man/tplyr_adae.Rd b/man/tplyr_adae.Rd new file mode 100644 index 00000000..409877da --- /dev/null +++ b/man/tplyr_adae.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{tplyr_adae} +\alias{tplyr_adae} +\title{ADAE Data} +\format{ +A data.frame with 276 rows and 55 columns. +} +\source{ +https://github.com/phuse-org/TestDataFactory +} +\usage{ +tplyr_adae +} +\description{ +A subset of the PHUSE Test Data Factory ADAE data set. +} +\seealso{ +[get_data_labels()] +} +\keyword{datasets} diff --git a/man/tplyr_adas.Rd b/man/tplyr_adas.Rd new file mode 100644 index 00000000..1d6528bc --- /dev/null +++ b/man/tplyr_adas.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{tplyr_adas} +\alias{tplyr_adas} +\title{ADAS Data} +\format{ +A data.frame with 1,040 rows and 40 columns. +} +\source{ +https://github.com/phuse-org/TestDataFactory +} +\usage{ +tplyr_adas +} +\description{ +A subset of the PHUSE Test Data Factory ADAS data set. +} +\seealso{ +[get_data_labels()] +} +\keyword{datasets} diff --git a/man/tplyr_adlb.Rd b/man/tplyr_adlb.Rd new file mode 100644 index 00000000..10863663 --- /dev/null +++ b/man/tplyr_adlb.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{tplyr_adlb} +\alias{tplyr_adlb} +\title{ADLB Data} +\format{ +A data.frame with 311 rows and 46 columns. +} +\source{ +https://github.com/phuse-org/TestDataFactory +} +\usage{ +tplyr_adlb +} +\description{ +A subset of the PHUSE Test Data Factory ADLB data set. 
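Since the bundled data sets documented above ship without a separate data dictionary, `get_data_labels()` (documented earlier in this change) offers a quick way to inspect their column labels, assuming the source .Rdata files carry variable label attributes:

```r
library(Tplyr)

# name/label pairs for the bundled ADAE subset
head(get_data_labels(tplyr_adae))
```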
+} +\seealso{ +[get_data_labels()] +} +\keyword{datasets} diff --git a/man/tplyr_adpe.Rd b/man/tplyr_adpe.Rd new file mode 100644 index 00000000..ab19f95d --- /dev/null +++ b/man/tplyr_adpe.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{tplyr_adpe} +\alias{tplyr_adpe} +\title{ADPE Data} +\format{ +A data.frame with 21 rows and 8 columns. +} +\usage{ +tplyr_adpe +} +\description{ +A mock-up dataset that is fit for testing data limiting +} +\keyword{datasets} diff --git a/man/tplyr_adsl.Rd b/man/tplyr_adsl.Rd new file mode 100644 index 00000000..2b17ae5e --- /dev/null +++ b/man/tplyr_adsl.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{tplyr_adsl} +\alias{tplyr_adsl} +\title{ADSL Data} +\format{ +A data.frame with 254 rows and 49 columns. +} +\source{ +https://github.com/phuse-org/TestDataFactory +} +\usage{ +tplyr_adsl +} +\description{ +A subset of the PHUSE Test Data Factory ADSL data set. +} +\seealso{ +[get_data_labels()] +} +\keyword{datasets} diff --git a/man/treat_grps.Rd b/man/treat_grps.Rd index 1ed42a3e..d133bb87 100644 --- a/man/treat_grps.Rd +++ b/man/treat_grps.Rd @@ -15,8 +15,8 @@ treat_grps(table) \arguments{ \item{table}{A \code{tplyr_table} object} -\item{...}{A named vector where names will become the new treatment group names, -and values will be used to construct those treatment groups} +\item{...}{A named vector where names will become the new treatment group +names, and values will be used to construct those treatment groups} \item{group_name}{The treatment group name used for the constructed 'Total' group} } @@ -24,24 +24,30 @@ and values will be used to construct those treatment groups} The modified table object } \description{ -Summary tables often present individual treatment groups, -but may additionally have a "Treatment vs. Placebo" or "Total" group added -to show grouped summary statistics or counts. This set of functions offers -an interface to add these groups at a table level and be consumed by -subsequent layers. +Summary tables often present individual treatment groups, but may +additionally have a "Treatment vs. Placebo" or "Total" group added to show +grouped summary statistics or counts. This set of functions offers an +interface to add these groups at a table level and be consumed by subsequent +layers. } \details{ \code{add_treat_grps} allows you to specify specific groupings. This is done -by supplying named arguments, where the name becomes the new treatment group's -name, and those treatment groups are made up of the argument's values. +by supplying named arguments, where the name becomes the new treatment +group's name, and those treatment groups are made up of the argument's +values. -\code{add_total_group} is a simple wrapper around \code{add_treat_grps}. Instead of -producing custom groupings, it produces a "Total" group by the supplied name, which -defaults to "Total". This "Total" group is made up of all existing treatment -groups within the population dataset. +\code{add_total_group} is a simple wrapper around \code{add_treat_grps}. +Instead of producing custom groupings, it produces a "Total" group by the +supplied name, which defaults to "Total". This "Total" group is made up of +all existing treatment groups within the population dataset. 
-The function \code{treat_grps} allows you to see the custom treatment groups available -in your \code{tplyr_table} object +Note that when using \code{add_treat_grps} or \code{add_total_row()} with +\code{set_pop_data()}, you should call \code{add_total_row()} AFTER calling +\code{set_pop_data()}, otherwise there is potential for unexpected behavior +with treatment groups. + +The function \code{treat_grps} allows you to see the custom treatment groups +available in your \code{tplyr_table} object } \examples{ tab <- tplyr_table(iris, Species) diff --git a/tests/testthat/_snaps/count.md b/tests/testthat/_snaps/count.md index 3e73875e..a3c4c87d 100644 --- a/tests/testthat/_snaps/count.md +++ b/tests/testthat/_snaps/count.md @@ -209,11 +209,21 @@ 8 2 ( 50.0%) 0 ( 0.0%) 1 3 1 9 0 ( 0.0%) 0 ( 0.0%) 1 3 2 -# nested count layers will error out if second variable is bigger than the first +# nested count can accept data if second variable is bigger than the first - i In index: 1. - Caused by error: - ! The number of values of your second variable must be greater than the number of levels in your first variable + Code + x + Output + row_label1 row_label2 var1_TRT1 + 1 Antiemetics and antinauseants Antiemetics and antinauseants 1 ( 50.0%) + 2 Antiemetics and antinauseants Promethazine hydrochloride 1 ( 50.0%) + 3 Psycholeptics Psycholeptics 1 ( 50.0%) + 4 Psycholeptics Promethazine hydrochloride 1 ( 50.0%) + var1_TRT2 ord_layer_index ord_layer_1 ord_layer_2 + 1 0 ( 0.0%) 1 1 Inf + 2 0 ( 0.0%) 1 1 1 + 3 1 (100.0%) 1 2 Inf + 4 1 (100.0%) 1 2 1 # set_numeric_threshold works as expected @@ -605,3 +615,33 @@ var1_54_F var1_54_M var1_81_F var1_81_M 1 27 (54.0) [50] 23 (67.6) [34] 17 (42.5) [40] 26 (59.1) [44] +# Error checking for add_missing_subjects_row() + + Argument `fmt` does not inherit "f_str". Classes: character + +--- + + Argument `sort_value` does not inherit "numeric". Classes: character + +--- + + Argument `e` does not inherit "count_layer". Classes: tplyr_layer, desc_layer, environment + +--- + + Argument `missing_subjects_row_label` must be character. Instead a class of "numeric" was passed. + +--- + + length(missing_subjects_row_label) not equal to 1 + +--- + + Argument `e` does not inherit "count_layer". Classes: tplyr_layer, desc_layer, environment + +# Missing counts on nested count layers function correctly + + Population data was not set separately from the target data. + Missing subject counts may be misleading in this scenario. + Did you mean to use `set_missing_count() instead?
+ diff --git a/tests/testthat/_snaps/data.md b/tests/testthat/_snaps/data.md new file mode 100644 index 00000000..0c9a6d34 --- /dev/null +++ b/tests/testthat/_snaps/data.md @@ -0,0 +1,80 @@ +# get_data_labels + + Code + get_data_labels(tplyr_adsl) + Output + # A tibble: 49 x 2 + name label + + 1 STUDYID Study Identifier + 2 USUBJID Unique Subject Identifier + 3 SUBJID Subject Identifier for the Study + 4 SITEID Study Site Identifier + 5 SITEGR1 Pooled Site Group 1 + 6 ARM Description of Planned Arm + 7 TRT01P Planned Treatment for Period 01 + 8 TRT01PN Planned Treatment for Period 01 (N) + 9 TRT01A Actual Treatment for Period 01 + 10 TRT01AN Actual Treatment for Period 01 (N) + # i 39 more rows + +--- + + Code + get_data_labels(tplyr_adae) + Output + # A tibble: 55 x 2 + name label + + 1 STUDYID Study Identifier + 2 SITEID Study Site Identifier + 3 USUBJID Unique Subject Identifier + 4 TRTA Actual Treatment + 5 TRTAN Actual Treatment (N) + 6 AGE Age + 7 AGEGR1 Pooled Age Group 1 + 8 AGEGR1N Pooled Age Group 1 (N) + 9 RACE Race + 10 RACEN Race (N) + # i 45 more rows + +--- + + Code + get_data_labels(tplyr_adas) + Output + # A tibble: 40 x 2 + name label + + 1 STUDYID Study Identifier + 2 SITEID Study Site Identifier + 3 SITEGR1 Pooled Site Group 1 + 4 USUBJID Unique Subject Identifier + 5 TRTSDT Date of First Exposure to Treatment + 6 TRTEDT Date of Last Exposure to Treatment + 7 TRTP Planned Treatment + 8 TRTPN Planned Treatment (N) + 9 AGE Age + 10 AGEGR1 Pooled Age Group 1 + # i 30 more rows + +--- + + Code + get_data_labels(tplyr_adlb) + Output + # A tibble: 46 x 2 + name label + + 1 STUDYID Study Identifier + 2 SUBJID Subject Identifier for the Study + 3 USUBJID Unique Subject Identifier + 4 TRTP Planned Treatment + 5 TRTPN Planned Treatment (N) + 6 TRTA Actual Treatment + 7 TRTAN Actual Treatment (N) + 8 TRTSDT Date of First Exposure to Treatment + 9 TRTEDT Date of Last Exposure to Treatment + 10 AGE Age + # i 36 more rows + diff --git a/tests/testthat/_snaps/layer.md b/tests/testthat/_snaps/layer.md index 6d6e847c..25bb92df 100644 --- a/tests/testthat/_snaps/layer.md +++ b/tests/testthat/_snaps/layer.md @@ -20,7 +20,7 @@ # Parent must be a `tplyr_table`, `tplyr_layer`, or `tplyr_subgroup_layer` - Must provide `tplyr_table`, `tplyr_layer`, or `tplyr_subgroup_layer` object from the `tplyr` package. + Must provide `tplyr_table` object from the `Tplyr` package. # `by` must me a string, a variable name, or multiple variables submitted using `dplyr::vars` diff --git a/tests/testthat/_snaps/layering.md b/tests/testthat/_snaps/layering.md index 4ce7816b..b2a93ce4 100644 --- a/tests/testthat/_snaps/layering.md +++ b/tests/testthat/_snaps/layering.md @@ -8,7 +8,7 @@ # Parent argument is a valid class (pass through to `tplyr_layer`) - Must provide `tplyr_table`, `tplyr_layer`, or `tplyr_subgroup_layer` object from the `tplyr` package. + Must provide `tplyr_table` object from the `Tplyr` package. 
# Only `Tplyr` methods are allowed in the `layer` parameter diff --git a/tests/testthat/_snaps/meta.md b/tests/testthat/_snaps/meta.md index 19c02cb6..3103c329 100644 --- a/tests/testthat/_snaps/meta.md +++ b/tests/testthat/_snaps/meta.md @@ -6,6 +6,14 @@ meta must be a tplyr_meta object +--- + + meta must be a tplyr_meta object + +--- + + join_meta must be a tplyr_meta object + --- Filters must be provided as a list of calls @@ -22,6 +30,10 @@ Names must be provided as a list of names +--- + + on must be provided as a list of names + # Metadata extraction and extension error properly t must be a tplyr_table object @@ -75,7 +87,38 @@ Output tplyr_meta: 3 names, 4 filters Names: - a, b, c + a, b, c + Filters: + a == 1, b == 2, c == 3, x == "a" + +# Anti-join extraction works properly + + Population data was not set separately from the target data. + Missing subject counts may be misleading in this scenario. + Did you mean to use `set_missing_count() instead? + +--- + + The `on` variable specified is missing from either the target data or the population data subsets. + Try adding the `on` variables to the `add_cols` parameter + +# Tplyr meta print method works as expected + + Code + print(meta2) + Output + tplyr_meta: 11 names, 5 filters + Names: + TRTP, EFFFL, ITTFL, ANL01FL, SITEGR1, AVISIT, AVISITN, PARAMCD, AVAL, BASE, CHG Filters: - a == 1, b == 2, c == 3, x == "a" + EFFFL == "Y", ITTFL == "Y", PARAMCD == "ACTOT", ANL01FL == "Y", AVISITN == 24 + Anti-join: + Join Meta: + tplyr_meta: 4 names, 2 filters + Names: + TRT01P, EFFFL, ITTFL, SITEGR1 + Filters: + EFFFL == "Y", ITTFL == "Y" + On: + USUBJID diff --git a/tests/testthat/test-apply_conditional_format.R b/tests/testthat/test-apply_conditional_format.R index a2b2f928..6740d04d 100644 --- a/tests/testthat/test-apply_conditional_format.R +++ b/tests/testthat/test-apply_conditional_format.R @@ -4,17 +4,17 @@ test_string2 <- c(" 0 ( 0.0%)", " 8 ( 9.3%)", "78 (90.7%) [ 5]", "12", "Howdy ya test_that("Test input validation and warning generation", { expect_error( apply_conditional_format(c(1), 2, x == 0, "(<1%)", full_string=TRUE), - "Paramter `string`" + "Parameter `string`" ) expect_error( apply_conditional_format(test_string1, "bad", x == 0, "(<1%)", full_string=TRUE), - "Paramter `format_group`" + "Parameter `format_group`" ) expect_error( apply_conditional_format(test_string1, 1.1, x == 0, "(<1%)", full_string=TRUE), - "Paramter `format_group`" + "Parameter `format_group`" ) expect_error( @@ -24,12 +24,12 @@ test_that("Test input validation and warning generation", { expect_error( apply_conditional_format(test_string1, 2, x == 0, 1, full_string=TRUE), - "Paramter `replacement" + "Parameter `replacement" ) expect_error( apply_conditional_format(test_string1, 2, x == 0, "(<1%)", full_string="TRUE"), - "Paramter `full_string`" + "Parameter `full_string`" ) expect_warning( diff --git a/tests/testthat/test-collapse_row_labels.R b/tests/testthat/test-collapse_row_labels.R new file mode 100644 index 00000000..86a34184 --- /dev/null +++ b/tests/testthat/test-collapse_row_labels.R @@ -0,0 +1,70 @@ +dat <- tibble::tribble( + ~row_label1, ~row_label2, ~row_label3, ~row_label4, ~var1, + "A", "C", "G", "M", 1L, + "A", "C", "G", "N", 2L, + "A", "C", "H", "O", 3L, + "A", "D", "H", "P", 4L, + "A", "D", "I", "Q", 5L, + "A", "D", "I", "R", 6L, + "B", "E", "J", "S", 7L, + "B", "E", "J", "T", 8L, + "B", "E", "K", "U", 9L, + "B", "F", "K", "V", 10L, + "B", "F", "L", "W", 11L +) + + +test_that("Errors generate as expected", { + 
expect_error(collapse_row_labels(1, blah, blah), "x must be a data frame") + expect_error( + collapse_row_labels(dat, row_label1, row_label2, indent = 1), + "indent must be a character string" + ) + expect_error( + collapse_row_labels(dat, row_label1, missing_col), + "Columns provided to dots are missing from x." + ) + expect_error( + collapse_row_labels(dat, row_label1, "row_label2"), + "Columns provided to dots must be provided as unquoted symbols." + ) + + expect_error( + collapse_row_labels(dat, row_label1, row_label2, target_col = "RL"), + "target_col must be provided as an unquoted symbol." + ) + expect_error( + collapse_row_labels(dat, row_label1), + "Must have two or more columns to collapse" + ) +}) + +test_that("Row labels collapse appropriately", { + + x <- collapse_row_labels(dat, row_label1, row_label2, row_label3, row_label4) + + expect_equal( + x$row_label[1:6], + c("A", " C", " G", " M", " N", "A") + ) + + x <- collapse_row_labels(dat, row_label1, row_label2, row_label3, row_label4, indent = " ") + expect_equal( + x$row_label[1:6], + c("A", " C", " G", " M", " N", "A") + ) + + x <- collapse_row_labels(dat, row_label1, row_label2, row_label3) + expect_equal(names(x), c("row_label", "row_label4", "var1")) + expect_equal( + x$row_label[1:6], + c("A", " C", " G", " G", " H", "A") + ) + expect_equal( + x$row_label4[1:6], + c("", "", "M", "N", "O", "") + ) + + x <- collapse_row_labels(dat, row_label1, row_label2, row_label3, target_col = rl) + expect_equal(names(x), c("rl", "row_label4", "var1")) +}) diff --git a/tests/testthat/test-count.R b/tests/testthat/test-count.R index c61f0d29..546da515 100644 --- a/tests/testthat/test-count.R +++ b/tests/testthat/test-count.R @@ -440,6 +440,17 @@ test_that("set_denom_where works as expected", { expect_snapshot_output(dput(t13)) }) +test_that("set_denom_where errors for incompatible object type", { + t1 <- tplyr_table(mtcars, gear) + + # Modify the object type to make it incompatible + class(t1) <- "environment" + + # Function errors + t1 <- set_denom_where(t1, mpg != 21) %>% + expect_error("Object type should be") +}) + test_that("missing counts can be set without a format and it inherits the layer format", { t1 <- tplyr_table(mtcars, gear) %>% add_layer( @@ -690,6 +701,7 @@ test_that("test IBM rounding option", { group_count(gender, by = "Gender") %>% set_format_strings(f_str("xxx (xxx%)", n, pct)) ) + expect_warning({tabl2 <- build(tabl2)}, "You have enabled IBM Rounding.") expect_equal(tabl2$var1_Placebo, c("485 ( 49%)", "515 ( 52%)")) @@ -710,16 +722,23 @@ test_that("test specific rounding proplem #124", { options(tplyr.IBMRounding = FALSE) }) -test_that("nested count layers will error out if second variable is bigger than the first", { - mtcars <- mtcars2 - mtcars$grp <- paste0("grp.", as.numeric(mtcars$cyl) + rep(c(0, 0.5), 16)) +test_that("nested count can accept data if second variable is bigger than the first", { + test_adcm <- data.frame( + SUBJID = c("1", "2", "3"), + ATC2 = c("Antiemetics and antinauseants", "Psycholeptics", "Psycholeptics"), + CMDECOD = c("Promethazine hydrochloride", "Promethazine hydrochloride", "Promethazine hydrochloride"), + TRT101A = c("TRT1", "TRT2", "TRT1") + ) - t <- tplyr_table(mtcars, gear) %>% + x <- test_adcm %>% + tplyr_table(TRT101A) %>% add_layer( - group_count(vars(grp, cyl)) - ) + group_count(vars(ATC2, CMDECOD)) + ) %>% + build() %>% + as.data.frame() - expect_snapshot_error(build(t)) + expect_snapshot(x) }) test_that("Posix columns don't cause the build to error out.", { @@ -886,6 +905,9 
@@ test_that("nested count layers error out when you try to add a total row", { ) expect_snapshot_error(build(tab)) + + # The weird use of mtcars2 makes us have to overwrite this again + mtcars <- mtcars2 }) test_that("Tables with pop_data can accept a layer level where", { @@ -923,3 +945,166 @@ test_that("Regression test to make sure cols produce correct denom", { expect_snapshot(t) }) + +test_that("Error checking for add_missing_subjects_row()", { + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + add_missing_subjects_row("blah") + ) + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + add_missing_subjects_row(f_str("xx", distinct_n), sort_value = "x") + ) + ) + + expect_error({ + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_shift(vars(AEBODSYS, AEDECOD)) %>% + add_missing_subjects_row(f_str("xx", distinct_n)) + ) + }, "`add_missing_subjects_row` for shift layers" + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_desc(RACEN) %>% + add_missing_subjects_row(f_str("xx", distinct_n)) + ) + ) + + ## ---- + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + set_missing_subjects_row_label(3) + ) + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(AEDECOD) %>% + set_missing_subjects_row_label(c("x", "y")) + ) + ) + + expect_error({ + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_shift(vars(AEBODSYS, AEDECOD)) %>% + set_missing_subjects_row_label("x") + )}, "`set_missing_subjects_row_label` for shift layers" + ) + + expect_snapshot_error( + tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_desc(RACEN) %>% + set_missing_subjects_row_label("x") + ) + ) + +}) + +test_that("Missing subjects row calculates correctly", { + x <- tplyr_table(tplyr_adlb, TRTA, cols=SEX) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(ANRIND, by = vars(PARAM, AVISIT)) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx", distinct_n)) + ) %>% + build() + + # Check 1 + in_res1 <- x %>% + filter(row_label3 == "Missing", row_label1 == "Blood Urea Nitrogen (mmol/L)", row_label2 == "Week 12") %>% + pull(var1_Placebo_F) %>% + as.numeric() + + pop1 <- tplyr_adsl %>% + filter(TRT01A == "Placebo", SEX == "F") %>% + nrow() + + dat1 <- tplyr_adlb %>% + filter(PARAM == "Blood Urea Nitrogen (mmol/L)", AVISIT == "Week 12", TRTA == "Placebo", SEX == "F") %>% + distinct(USUBJID) %>% + nrow() + + expect_equal(pop1-dat1, in_res1) + + # Check 2 + in_res2 <- x %>% + filter(row_label3 == "Missing", row_label1 == "Gamma Glutamyl Transferase (U/L)", row_label2 == "Week 24") %>% + pull(`var1_Xanomeline Low Dose_M`) %>% + as.numeric() + + pop2 <- tplyr_adsl %>% + filter(TRT01A == "Xanomeline Low Dose", SEX == "M") %>% + nrow() + + dat2 <- tplyr_adlb %>% + filter(PARAM == "Gamma Glutamyl Transferase (U/L)", AVISIT == "Week 24", TRTA == "Xanomeline Low Dose", SEX == "M") %>% + distinct(USUBJID) %>% + nrow() + + expect_equal(pop2-dat2, in_res2) + +}) + +test_that("Missing counts on nested count layers function correctly", { + x <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) + ) %>% + build() + + 
expect_equal(nrow(x %>% filter(row_label2 == " Missing")), 1) + expect_equal(tail(x, 1)$ord_layer_2, Inf) + + # Verify that bycount works for missing values and sort value is assigned correctly + x <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_distinct_by(USUBJID) %>% + set_order_count_method("bycount") %>% + set_ordering_cols("Xanomeline High Dose") %>% + set_result_order_var(distinct_n) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = 99999) + ) %>% + build() + + expect_equal(tail(x, 1)$ord_layer_2, 99999) + + # Also test that label reassignment flows + # The warning here is intentional + expect_snapshot_warning({ + x <- tplyr_table(tplyr_adsl, TRT01A) %>% + add_layer( + group_count(vars(SEX, RACE)) %>% + set_order_count_method(c("byfactor", "byvarn")) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = 99999) %>% + set_missing_subjects_row_label("New label") + ) %>% + build() + }) + + expect_equal(filter(x, row_label2 == " New label")$ord_layer_2, c(99999, 99999)) +}) diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R new file mode 100644 index 00000000..ea2b1ce3 --- /dev/null +++ b/tests/testthat/test-data.R @@ -0,0 +1,6 @@ +test_that("get_data_labels", { + expect_snapshot(get_data_labels(tplyr_adsl)) + expect_snapshot(get_data_labels(tplyr_adae)) + expect_snapshot(get_data_labels(tplyr_adas)) + expect_snapshot(get_data_labels(tplyr_adlb)) +}) diff --git a/tests/testthat/test-desc.R b/tests/testthat/test-desc.R index 3105340a..6a528360 100644 --- a/tests/testthat/test-desc.R +++ b/tests/testthat/test-desc.R @@ -141,3 +141,24 @@ test_that("Stats as columns properly transposes the built data", { expect_snapshot(as.data.frame(d2)) }) + +test_that("Infinites aren't produced from min/max", { + dat <- tibble::tribble( + ~x1, ~x2, + 'a', 1, + 'a', 2, + 'b', NA, + ) + + t <- tplyr_table(dat, x1) %>% + add_layer( + group_desc(x2) %>% + set_format_strings( + "Min, Max" = f_str("xx, xx", min, max) + ) + ) + + x <- suppressWarnings(build(t)) + + expect_equal(x$var1_b, "") +}) diff --git a/tests/testthat/test-format.R b/tests/testthat/test-format.R index b618ba08..33e8bc14 100644 --- a/tests/testthat/test-format.R +++ b/tests/testthat/test-format.R @@ -256,3 +256,25 @@ test_that("Format string setting and autoprecision are detected appropriately", expect_equal(fmt26$settings, s26) }) + +test_that("Ellipsis unpacking of external variables functions effectively - (#111)", { + + # Define a list of f_str's + num_formats <- list( + "N" = f_str("xx", n), + "Mean (SD)" = f_str("xx.x (xx.xx)", mean, sd), + "Median" = f_str("xx.x", median), + "Q1" = f_str("xx", q1), + "Q3" = f_str("xx", q3), + "Min" = f_str("xx", min), + "Max" = f_str("xx", max) + ) + + # `add_layers()` example, create the tplyr_table + t <- tplyr_table(iris, Species) + + l <- group_desc(t, Petal.Length) %>% + set_format_strings(!!!num_formats) + + expect_identical(num_formats, l$format_strings) +}) diff --git a/tests/testthat/test-layering.R b/tests/testthat/test-layering.R index 9b76f6ad..62fcc244 100644 --- a/tests/testthat/test-layering.R +++ b/tests/testthat/test-layering.R @@ -123,4 +123,22 @@ test_that("Layers accept names when specified", { }) +test_that("add_layer can see calling environment objects", { + tfunc <- function(){ + + prec <- tibble::tribble( + ~vs, ~max_int, ~max_dec, + 0, 1, 1, + 1, 2, 2 + ) + + 
tplyr_table(mtcars, gear) %>% + add_layer( + group_desc(wt, by = vs) %>% + set_precision_data(prec) + ) + } + + expect_silent(tfunc()) +}) diff --git a/tests/testthat/test-meta.R b/tests/testthat/test-meta.R index 3400c209..a8b8e9b2 100644 --- a/tests/testthat/test-meta.R +++ b/tests/testthat/test-meta.R @@ -86,6 +86,8 @@ test_that("Metadata creation errors generate properly", { # Not providing metadata object expect_snapshot_error(add_variables(mtcars, quos(a))) expect_snapshot_error(add_filters(mtcars, quos(a==1))) + expect_snapshot_error(add_anti_join(mtcars, m, quos(a==1))) + expect_snapshot_error(add_anti_join(m, mtcars, quos(a==1))) # Didn't provide filter expect_snapshot_error(tplyr_meta(quos(a), 'x')) @@ -94,6 +96,7 @@ test_that("Metadata creation errors generate properly", { # Didn't provide names expect_snapshot_error(tplyr_meta('x')) expect_snapshot_error(add_variables(m, 'x')) + expect_snapshot_error(add_anti_join(m, m, 'x')) }) @@ -105,9 +108,12 @@ test_that("Exported metadata function construct metadata properly", { m <- add_variables(m, quos(x)) m <- add_filters(m, quos(x=="a")) + m2 <- add_anti_join(m, m, quos(y)) expect_equal(m$names, quos(a, b, c, x)) expect_equal(m$filters, quos(a==1, b==2, c==3, x=="a")) + expect_equal(m2$anti_join$join_meta, m) + expect_equal(m2$anti_join$on, quos(y)) }) test_that("Descriptive Statistics metadata backend assembles correctly", { @@ -352,3 +358,99 @@ test_that("Metadata print method is accurate", { x <- tplyr_meta(quos(a, b, c), quos(a==1, b==2, c==3, x=="a")) expect_snapshot(print(x)) }) + + +test_that("Anti-join extraction works properly", { + + # This is purposefully a convoluted warning that's unrealistic, hence the + # warning that's generating. + expect_snapshot_warning({ + t <- tplyr_table(tplyr_adsl, TRT01A, cols = ETHNIC) %>% + add_layer( + group_count(RACE, by = SEX) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row() + ) + }) + + x <- build(t, metadata=TRUE) + + # Check that the object looks right + res <- get_meta_result(t, 'c7_1', 'var1_Placebo_HISPANIC OR LATINO') + + expect_equal(unname(map_chr(res$names, as_label)), c("TRT01A", "SEX", "ETHNIC", "RACE")) + expect_equal( + unname(map_chr(res$filters, as_label)), + c("TRT01A == c(\"Placebo\")", "SEX == c(\"F\")", "ETHNIC == c(\"HISPANIC OR LATINO\")", + "TRUE", "TRUE") + ) + expect_equal(unname(map_chr(res$anti_join$join_meta$names, as_label)), c("TRT01A", "ETHNIC")) + expect_equal( + unname(map_chr(res$anti_join$join_meta$filters, as_label)), + c("TRT01A == c(\"Placebo\")", "ETHNIC == c(\"HISPANIC OR LATINO\")", "TRUE", "TRUE") + ) + expect_equal(as_label(res$anti_join$on[[1]]), "USUBJID") + + # Variables needed for the merge aren't there + expect_snapshot_error(get_meta_subset(t, 'c7_1', 'var1_Placebo_HISPANIC OR LATINO', add_cols = quos(SITEID))) + + + sbst <- get_meta_subset(t, 'c7_1', 'var1_Placebo_HISPANIC OR LATINO') + + + cmp <- tplyr_adsl %>% filter( + USUBJID == "01-701-1023" + ) + + # The counted subjects will include female, so this subject would have to be male + # Again - this is a weird example that wouldn't be used in practice, but this is the + # row variable + expect_true(cmp$SEX == "M") + # Since this is column, these would both match the metadata + expect_true(cmp$TRT01A == "Placebo") + expect_true(cmp$ETHNIC == "HISPANIC OR LATINO") + + # and then selecting out the columns these should match + expect_equal( + sbst, + cmp %>% + select(USUBJID, TRT01A, ETHNIC) + ) + + # Now for a real example, but also test for nested counts + t <- 
tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) + ) + + x <- build(t, metadata=TRUE) + + sbst <- get_meta_subset(t, 'c23_1', 'var1_Placebo') + + # If you manually check out x, the count here is 65 + expect_equal(nrow(sbst), 65) + expect_equal(unique(sbst$TRT01A), "Placebo") + +}) + +test_that("Tplyr meta print method works as expected", { + meta <- tplyr_meta( + names = quos(TRTP, EFFFL, ITTFL, ANL01FL, SITEGR1, AVISIT, AVISITN, PARAMCD, AVAL, BASE, CHG), + filters = quos(EFFFL == "Y", ITTFL == "Y", PARAMCD == "ACTOT", ANL01FL == "Y", AVISITN == 24) + ) + + meta2 <- meta %>% + add_anti_join( + join_meta = tplyr_meta( + names = quos(TRT01P, EFFFL, ITTFL, SITEGR1), + filters = quos(EFFFL == "Y", ITTFL == "Y") + ), + on = quos(USUBJID) + ) + + expect_snapshot(print(meta2)) +}) diff --git a/tests/testthat/test-replace_leading_whitespace.R b/tests/testthat/test-replace_leading_whitespace.R new file mode 100644 index 00000000..d8ad0098 --- /dev/null +++ b/tests/testthat/test-replace_leading_whitespace.R @@ -0,0 +1,15 @@ +test_that("Test replacement of leading whitespace", { + x <- c("Hello there", " Goodbye Friend ", "\tNice to meet you", " \t What are you up to? \t \t ") + + expect_equal( + replace_leading_whitespace(x), + c("Hello there", "  Goodbye Friend ", + "    Nice to meet you", "       What are you up to? \t \t ") + ) + + expect_equal( + replace_leading_whitespace(x, tab=2), + c("Hello there", "  Goodbye Friend ", + "  Nice to meet you", "     What are you up to? \t \t ") + ) +}) diff --git a/tests/testthat/test-set_limit_data_by.R b/tests/testthat/test-set_limit_data_by.R new file mode 100644 index 00000000..1f0e2bae --- /dev/null +++ b/tests/testthat/test-set_limit_data_by.R @@ -0,0 +1,138 @@ +test_that("Descriptive statistics data limiting works properly", { + t1 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) + ) + + x1 <- build(t1) + + cnts1 <- dplyr::count(x1, row_label1, row_label2) + expect_equal(cnts1$n, c(18, 18, 18, 18)) + + t2 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) + + x2 <- build(t2) + + cnts2 <- dplyr::count(x2, row_label1, row_label2) + expect_equal(cnts2$n, c(6, 18, 6, 18)) + + + t3 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_desc(AVAL, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) + + x3 <- build(t3) + + cnts3 <- dplyr::count(x3, row_label1, row_label2) + expect_equal(cnts3$n, c(6, 18, 18)) +}) + + +test_that("Shift layers can also handle data limiting", { + + t1 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) + ) + + x1 <- build(t1) + + cnts1 <- dplyr::count(x1, row_label1, row_label2) + expect_equal(cnts1$n, c(9, 9, 9, 9)) + + t2 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) + + x2 <- build(t2) + + cnts2 <- dplyr::count(x2, row_label1, row_label2) + expect_equal(cnts2$n, c(3, 9, 3, 9)) + + t3 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_shift(vars(row=BASEC, column=AVALC), by = vars(PECAT, PARAM, AVISIT)) %>% + 
set_limit_data_by(PECAT, PARAM, AVISIT) + ) + + x3 <- build(t3) + + cnts3 <- dplyr::count(x3, row_label1, row_label2) + expect_equal(cnts3$n, c(3, 9, 9)) + +}) + + +test_that("Count data limiting works properly", { + + t1 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) + ) + + x1 <- build(t1) + + cnts1 <- dplyr::count(x1, row_label1, row_label2) + expect_equal(cnts1$n, c(9, 9, 9, 9)) + + t2 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) + + x2 <- build(t2) + + cnts2 <- dplyr::count(x2, row_label1, row_label2) + expect_equal(cnts2$n, c(3, 9, 3, 9)) + + t3 <- tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PECAT, PARAM, AVISIT)) %>% + set_limit_data_by(PECAT, PARAM, AVISIT) + ) + + x3 <- build(t3) + + cnts3 <- dplyr::count(x3, row_label1, row_label2) + expect_equal(cnts3$n, c(3, 9, 9)) +}) + + +test_that("Nested count layers limit data accurately", { + + t_ae1 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% + set_limit_data_by(AEOUT, AEDECOD) + ) + + t_ae_df1 <- t_ae1 %>% + build() %>% select(-starts_with('ord')) + + t_ae2 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) %>% + set_limit_data_by(AESEV, AEOUT, AEDECOD) + ) + + t_ae_df2 <- t_ae2 %>% + build() %>% select(-starts_with('ord')) + + dropped_rows <- anti_join( + t_ae_df1, + t_ae_df2, + by=names(t_ae_df1) + ) + + check <- c(dropped_rows$var1_Placebo, dropped_rows$`var1_Xanomeline High Dose`, dropped_rows$`var1_Xanomeline Low Dose`) + expect_true(all(check == " 0 ( 0.0%)")) +}) diff --git a/tests/testthat/test-sort.R b/tests/testthat/test-sort.R index c77458dd..e32fa740 100644 --- a/tests/testthat/test-sort.R +++ b/tests/testthat/test-sort.R @@ -264,3 +264,39 @@ test_that("by variables get sorted with varn/factors in the correct order", { expect_equal(t3[["row_label1"]], c("1", "1", "1", "0", "0", "0")) expect_equal(t3[["ord_layer_1"]], c(1, 1, 1, 2, 2, 2)) }) + + +# Added to address #175 +test_that("Nested counts with by variables process properly", { + + t_ae1 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD), by = vars(AESEV, AEOUT)) + %>% + set_order_count_method("bycount") %>% + set_ordering_cols("Xanomeline High Dose") %>% + set_result_order_var(distinct_n) + ) + + t_ae_df1 <- t_ae1 %>% + build() + + # This is verifying that the right number of combinations of row_labels exist, and that + # there aren't duplicate order values for the outer layer + expect_equal(nrow(dplyr::count(t_ae_df1, row_label2, row_label3, ord_layer_3)), 6) + + t_ae2 <- tplyr_table(tplyr_adae, TRTA) %>% + add_layer( + group_count(vars("testing", AEDECOD), by=AEOUT) %>% + set_order_count_method("bycount") %>% + set_ordering_cols("Xanomeline High Dose") %>% + set_result_order_var(distinct_n) + ) + + t_ae_df2 <- t_ae2 %>% + build() + + # Same test but now working with a text outer layer and one by variable + expect_equal(nrow(dplyr::count(t_ae_df2, row_label2, ord_layer_2)), 2) + +}) diff --git a/tests/testthat/test-table.R b/tests/testthat/test-table.R index 0ad6340b..cb7cc564 100644 --- a/tests/testthat/test-table.R +++ b/tests/testthat/test-table.R @@ -1,9 +1,3 @@ -test_that("tplyr_table returns an empty envrionment of class 'tplyr_table' when passed no arguemnts", { - st <- tplyr_table() - 
expect_true(is.environment(st)) - expect_equal(length(rlang::env_names(st)), 0) -}) - test_that("tplyr_table returns a class of tplyr_table and environment", { tab <- tplyr_table(data.frame(a = 1:10, b = 11:20), a) expect_s3_class(tab, "tplyr_table") diff --git a/vignettes/Tplyr.Rmd b/vignettes/Tplyr.Rmd index 5c2068b9..d22f99e8 100644 --- a/vignettes/Tplyr.Rmd +++ b/vignettes/Tplyr.Rmd @@ -21,9 +21,6 @@ library(tidyverse) library(magrittr) library(Tplyr) library(knitr) -load("adae.Rdata") -load("adsl.Rdata") -load("adlb.Rdata") ``` # How **Tplyr** Works @@ -56,7 +53,7 @@ When a `tplyr_table()` is created, it will contain the following bindings: The function `tplyr_table()` allows you a basic interface to instantiate the object. Modifier functions are available to change individual parameters catered to your analysis. ```{r tplyr_table} -t <- tplyr_table(adsl, TRT01P, where = SAFFL == "Y") +t <- tplyr_table(tplyr_adsl, TRT01P, where = SAFFL == "Y") t ``` @@ -90,7 +87,7 @@ Everyone has their own style of coding - so we've tried to be flexible to an ext There are two ways to add layers to a `tplyr_table()`: `add_layer()` and `add_layers()`. The difference is that `add_layer()` allows you to construct the layer within the call to `add_layer()`, whereas with `add_layers()` you can attach multiple layers that have already been constructed upfront: ```{r add_layer} -t <- tplyr_table(adsl, TRT01P) %>% +t <- tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(AGEGR1, by = "Age categories n (%)") ) @@ -100,7 +97,7 @@ t <- tplyr_table(adsl, TRT01P) %>% Within `add_layer()`, the syntax to constructing the count layer for Age Categories was written on the fly. `add_layer()` is special in that it also allows you to use piping to use modifier functions on the layer being constructed ```{r add_layer_with_piping} -t <- tplyr_table(adsl, TRT01P) %>% +t <- tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(AGEGR1, by = "Age categories n (%)") %>% set_format_strings(f_str("xx (xx.x%)", n, pct)) %>% @@ -112,7 +109,7 @@ t <- tplyr_table(adsl, TRT01P) %>% `add_layers()`, on the other hand, lets you isolate the code to construct a particular layer if you wanted to separate things out more. Some might find this cleaner to work with if you have a large number of layers being constructed. ```{r add_layers} -t <- tplyr_table(adsl, TRT01P) +t <- tplyr_table(tplyr_adsl, TRT01P) l1 <- group_count(t, AGEGR1, by = "Age categories n (%)") l2 <- group_desc(t, AGE, by = "Age (years)") @@ -129,7 +126,7 @@ Notice that when you construct a `tplyr_table()` or a `tplyr_layer()` that what To generate the data from a `tplyr_table()` object, you use the function `build()`: ```{r build} -t <- tplyr_table(adsl, TRT01P) %>% +t <- tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(AGEGR1, by = "Age categories n (%)") ) @@ -162,7 +159,7 @@ So - why is this object necessary. Consider this example: ```{r format_strings_1} -t <- tplyr_table(adsl, TRT01P) %>% +t <- tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(AGE, by = "Age (years)") %>% set_format_strings( @@ -199,7 +196,7 @@ This simple user input controls a significant amount of work in the back end of `f_str()` objects are also used with count layers as well to control the data presentation. Instead of specifying the summaries performed, you use `n`, `pct`, `distinct_n`, and `distinct_pct` for your parameters and specify how you would like the values displayed. 
Using `distinct_n` and `distinct_pct` should be combined with specifying a `distinct_by()` variable using `set_distinct_by()`. ```{r format_strings_2} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(AGEGR1, by = "Age categories") %>% set_format_strings(f_str('xx (xx.x)',n,pct)) @@ -207,7 +204,7 @@ tplyr_table(adsl, TRT01P) %>% build() %>% kable() -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(AGEGR1, by = "Age categories") %>% set_format_strings(f_str('xx',n)) @@ -220,7 +217,7 @@ tplyr_table(adsl, TRT01P) %>% Really - format strings allow you to present your data however you like. ```{r format_strings_3} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(AGEGR1, by = "Age categories") %>% set_format_strings(f_str('xx (•◡•) xx.x%',n,pct)) @@ -238,7 +235,7 @@ But should you? Probably not. As covered under string formatting, `set_format_strings()` controls a great deal of what happens within a descriptive statistics layer. Note that there are some built in defaults to what's output: ```{r desc1} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(AGE, by = "Age (years)") ) %>% @@ -249,7 +246,7 @@ tplyr_table(adsl, TRT01P) %>% To override these defaults, just specify the summaries that you want to be performed using `set_format_strings()` as described above. But what if **Tplyr** doesn't have a built in function to do the summary statistic that you want to see? Well - you can make your own! This is where `set_custom_summaries()` comes into play. Let's say you want to derive a geometric mean. ```{r custom_summaries} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(AGE, by = "Sepal Length") %>% set_custom_summaries( @@ -269,7 +266,7 @@ In `set_custom_summaries()`, first you name the summary being performed. This is Sometimes there's a need to present multiple variables summarized side by side. **Tplyr** allows you to do this as well. ```{r desc2} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(vars(AGE, AVGDD), by = "Age and Avg. Daily Dose") ) %>% @@ -285,7 +282,7 @@ tplyr_table(adsl, TRT01P) %>% Count layers generally allow you to create "n" and "n (%)" count type summaries. There are a few extra features here as well. Let's say that you want a total row within your counts. This can be done with `add_total_row()`: ```{r count_total1} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(AGEGR1, by = "Age categories") %>% add_total_row() @@ -298,7 +295,7 @@ tplyr_table(adsl, TRT01P) %>% Sometimes it's also necessary to count summaries based on distinct values. **Tplyr** allows you to do this as well with `set_distinct_by()`: ```{r count_distinct} -tplyr_table(adae, TRTA) %>% +tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count('Subjects with at least one adverse event') %>% set_distinct_by(USUBJID) %>% @@ -314,7 +311,7 @@ There's another trick going on here - to create a summary with row label text li Adverse event tables often call for counting AEs of something like a body system and counting actual events within that body system. **Tplyr** has means of making this simple for the user as well. ```{r count_nested} -tplyr_table(adae, TRTA) %>% +tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(vars(AEBODSYS, AEDECOD)) ) %>% @@ -331,10 +328,10 @@ Lastly, let's talk about shift layers. 
A common example of this would be looking ```{r shift1} # Tplyr can use factor orders to dummy values and order presentation -adlb$ANRIND <- factor(adlb$ANRIND, c("L", "N", "H")) -adlb$BNRIND <- factor(adlb$BNRIND, c("L", "N", "H")) +tplyr_adlb$ANRIND <- factor(tplyr_adlb$ANRIND, c("L", "N", "H")) +tplyr_adlb$BNRIND <- factor(tplyr_adlb$BNRIND, c("L", "N", "H")) -tplyr_table(adlb, TRTA, where = PARAMCD == "CK") %>% +tplyr_table(tplyr_adlb, TRTA, where = PARAMCD == "CK") %>% add_layer( group_shift(vars(row=BNRIND, column=ANRIND), by=PARAM) %>% set_format_strings(f_str("xx (xxx%)", n, pct)) diff --git a/vignettes/count.Rmd b/vignettes/count.Rmd index ca77ee72..969d31eb 100644 --- a/vignettes/count.Rmd +++ b/vignettes/count.Rmd @@ -20,9 +20,6 @@ knitr::opts_chunk$set( library(Tplyr) library(dplyr, warn.conflicts = FALSE) library(knitr) -load("adsl.Rdata") -load("adae.Rdata") -load("adlb.Rdata") ``` At the surface, counting sounds pretty simple, right? You just want to know how many occurrences of something there are. Well - unfortunately, it's not that easy. And in clinical reports, there's quite a bit of nuance that goes into the different types of frequency tables that need to be created. Fortunately, we’ve added a good bit of flexibility into `group_count()` to help you get what you need when creating these reports, whether you’re creating a demographics table, adverse events, or lab results. @@ -32,7 +29,7 @@ At the surface, counting sounds pretty simple, right? You just want to know how Let's start with a basic example. This table demonstrates the distribution of subject disposition across treatment groups. Additionally, we're sorting by descending total occurrences using the "Total" group. ```{r} -t <- tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>% +t <- tplyr_table(tplyr_adsl, TRT01P, where = SAFFL == "Y") %>% add_total_group() %>% add_treat_grps(Treated = c("Xanomeline Low Dose", "Xanomeline High Dose")) %>% add_layer( @@ -55,7 +52,7 @@ Another exceptionally important consideration within count layers is whether you **Tplyr** allows you to focus on these distinct counts and distinct percents within some grouping variable, like subject. Additionally, you can mix and match with the distinct counts with non-distinct counts in the same row too. The `set_distinct_by()` function sets the variables used to calculate the distinct occurrences of some value using the specified `distinct_by` variables. ```{r} -t <- tplyr_table(adae, TRTA) %>% +t <- tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(AEDECOD) %>% set_distinct_by(USUBJID) %>% @@ -72,7 +69,7 @@ You may have seen tables before like the one above. This display shows the numbe An additional option for formatting the numbers above would be using 'parenthesis hugging'. To trigger this, on the integer side of a number use a capital 'X' or a capital 'A'. For example: ```{r} -t <- tplyr_table(adae, TRTA) %>% +t <- tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(AEDECOD) %>% set_distinct_by(USUBJID) %>% @@ -96,7 +93,7 @@ One way to approach this would be creating two summaries. One summarizing the bo The example below demonstrates how to do a nested summary. Look at the first row - here `row_label1` and `row_label2` are both "CARDIAC DISORDERS". This line is the summary for `AEBODSYS.` In the rows below that, `row_label1` continues on with the value "CARDIAC DISORDERS", but `row_label2` changes. These are the summaries for `AEDECOD`. 
```{r} -tplyr_table(adae, TRTA) %>% +tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(vars(AEBODSYS, AEDECOD)) ) %>% @@ -108,7 +105,7 @@ tplyr_table(adae, TRTA) %>% This accomplishes what we needed, but it's not exactly the presentation you might hope for. We have a solution for this as well. ```{r} -tplyr_table(adae, TRTA) %>% +tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(vars(AEBODSYS, AEDECOD)) %>% set_nest_count(TRUE) %>% diff --git a/vignettes/custom-metadata.Rmd b/vignettes/custom-metadata.Rmd index 5b75c0bc..108a6f05 100644 --- a/vignettes/custom-metadata.Rmd +++ b/vignettes/custom-metadata.Rmd @@ -25,11 +25,9 @@ library(knitr) ``` ```{r data prep, echo=FALSE} -load("adas.Rdata") -load("adsl.Rdata") -t <- tplyr_table(adas, TRTP, where=EFFFL == "Y" & ITTFL == "Y" & PARAMCD == "ACTOT" & ANL01FL == "Y") %>% - set_pop_data(adsl) %>% +t <- tplyr_table(tplyr_adas, TRTP, where=EFFFL == "Y" & ITTFL == "Y" & PARAMCD == "ACTOT" & ANL01FL == "Y") %>% + set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01P) %>% set_pop_where(EFFFL == "Y" & ITTFL == "Y") %>% set_distinct_by(USUBJID) %>% @@ -161,6 +159,21 @@ When building a data frame for use with `tplyr_table` metadata, there are really The `row_id` values built by **Tplyr** will always follow the format "n_n", where the first letter of the layer type will either be "c", "d", or "s". The next number is the layer number (i.e. the order in which the layer was inserted to the **Tplyr** table), and then finally the row of that layer within the output. For example, the third row of a count layer that was the second layer in the table would have a `row_id` of "c2_3". In this example, I chose "x4_n" as the format for the "x" to symbolize custom, and these data can be thought of as the fourth layer. That said, these values would typically be masked by the viewer of the table so they really just need to be unique - so you can choose whatever you want. +### Anti-joins + +If the custom metadata you're constructing requires references to data outside your target dataset, this is also possible with a `tplyr_meta` object. If you're looking for non-overlap with the target dataset, you can use an anti-join. Anti-joins can be added to a `tplyr_meta` object using the `add_anti_join()` function. + + +```{r anti_join1} +meta %>% + add_anti_join( + join_meta = tplyr_meta( + names = quos(TRT01P, EFFFL, ITTFL, SITEGR1), + filters = quos(EFFFL == "Y", ITTFL == "Y") + ), + on = quos(USUBJID) + ) +``` ## Appending Existing **Tplyr** Metadata Now that we've created our custom extension of the **Tplyr** metadata, let's extend the existing data frame. To do this, **Tplyr** has the function `append_metadata()`: @@ -188,7 +201,7 @@ get_meta_subset(t, 'x4_1', "var1_Xanomeline High Dose") %>% You very well may have a scenario where you want to use these metadata functions outside of **Tplyr** in general. 
As such, there are S3 methods available to query metadata from a dataframe instead of a **Tplyr** table, and parameters to provide your own target data frame: ```{r metadata without Tplyr} -get_meta_subset(eff_meta, 'x4_1', "var1_Xanomeline High Dose", target=adas) %>% +get_meta_subset(eff_meta, 'x4_1', "var1_Xanomeline High Dose", target=tplyr_adas) %>% head() %>% kable() ``` diff --git a/vignettes/denom.Rmd b/vignettes/denom.Rmd index 3caefcc2..cb657c4e 100644 --- a/vignettes/denom.Rmd +++ b/vignettes/denom.Rmd @@ -20,9 +20,6 @@ knitr::opts_chunk$set( library(Tplyr) library(dplyr, warn.conflicts = FALSE) library(knitr) -load("adsl.Rdata") -load("adae.Rdata") -load("adlb.Rdata") ``` Counting is pretty easy, right? There's not all that much to it. With a few considerations we can cover most of the scenarios that users will encounter while using Tplyr. Denominators, on the other hand, get *_a lot_* more complicated. Why? Because there are a lot of ways to do it. What values do we exclude from the denominator? What variables establish denominator grouping? Does the denominator use a different filter than the values being counted? If you've programmed enough of these tables, you know that it's all very situational. @@ -40,7 +37,7 @@ For this reason,**Tplyr** allows lets you set a separate population dataset - bu Consider these two examples. ```{r} -tplyr_table(adae, TRTA) %>% +tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(AEDECOD) %>% set_distinct_by(USUBJID) %>% @@ -52,8 +49,8 @@ tplyr_table(adae, TRTA) %>% ``` ```{r} -tplyr_table(adae, TRTA) %>% - set_pop_data(adsl) %>% +tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01A) %>% add_layer( group_count(AEDECOD) %>% @@ -84,10 +81,10 @@ When you're looking within a single dataset, there are a couple factors that you Most of the complexity of denominators comes from nuanced situations. A solid 80% of the time, defaults will work. For example, in a frequency table, you will typically want data within a column to sum to 100%. For example: ```{r} -adsl <- adsl %>% +tplyr_adsl <- tplyr_adsl %>% mutate(DCSREAS = ifelse(DCSREAS == '', 'Completed', DCSREAS)) -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(DCSREAS) ) %>% @@ -97,7 +94,7 @@ tplyr_table(adsl, TRT01P) %>% By default, when not using the population data strategy shown above, a count layer assumes that you want columns to sum to 100%. But that's not always the case. Perhaps you'd like to break this summary down by sex presented row-wise. ```{r} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(DCSREAS, by=SEX) ) %>% @@ -108,7 +105,7 @@ tplyr_table(adsl, TRT01P) %>% Ok - so, now this is a little bit off. By breaking sex down as a row group, the denominators are still the total treatment group. Does that make sense? 34 female Placebo group subjects completed, but that calculated 39.5% also includes male subjects in the denominator. Let's fix this using `set_denoms_by()`. ```{r} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(DCSREAS, by=SEX) %>% set_denoms_by(SEX, TRT01P) @@ -130,7 +127,7 @@ Depending on your presentation, what you require may change - but the flexibilit A major part of the shift API is the control of the denominators used in the calculation of the percentages. In shift tables, most percentages are relative to the "box" that is formed from the "from" and "to" groups of the shift for each treatment group. 
Just like the count layers, the `set_denoms_by()` functions any variable name from the treatment variable, `cols` argument, `by` variables. The difference with shift layers is that now you can also include your target variables used for the row or column. ```{r} -tplyr_table(adlb, TRTA, where=PARAMCD == "CK") %>% +tplyr_table(tplyr_adlb, TRTA, where=PARAMCD == "CK") %>% add_layer( group_shift(vars(row = BNRIND, column = ANRIND), by = vars(PARAM, AVISIT)) %>% set_format_strings(f_str("xx (xxx.x%)", n, pct)) %>% @@ -146,7 +143,7 @@ In the example above, the denominators were based on the by and treatment variab In the next example, the percentage denominators are calculated row-wise, each row percentage sums to 100%. ```{r} -tplyr_table(adlb, TRTA, where=PARAMCD == "CK") %>% +tplyr_table(tplyr_adlb, TRTA, where=PARAMCD == "CK") %>% add_layer( group_shift(vars(row = BNRIND, column = ANRIND), by = vars(PARAM, AVISIT)) %>% set_format_strings(f_str("xx (xxx.x%)", n, pct)) %>% @@ -161,7 +158,7 @@ tplyr_table(adlb, TRTA, where=PARAMCD == "CK") %>% While not practical, in this last example the denominators are changed to be based on the entire column instead of the 3 x 3 box. By passing the column variables, `TRTA` and `ANRIND` the layer will use those denominators when determining the percentages. ```{r} -tplyr_table(adlb, TRTA, where = PARAMCD == "CK") %>% +tplyr_table(tplyr_adlb, TRTA, where = PARAMCD == "CK") %>% add_layer( group_shift(vars(row = BNRIND, column = ANRIND), by = vars(PARAM, AVISIT)) %>% set_format_strings(f_str("xx (xx.xx%)", n, pct)) %>% @@ -188,10 +185,10 @@ So let's take the example shown below. The first layer has no layer level filter The third layer has a layer level filter applied, but additionally uses `set_denom_where()`. The `set_denom_where()` in this example is actually *removing* the layer level filter for the denominators. This is because in R, when you filter using `TRUE`, the filter returns all records. So by using `TRUE` in `set_denom_where()`, the layer level filter is effectively removed. This causes the denominator to include all values available from the table and not just those selected for that layer - so for this layer, the percentages will *not add up to 100%*. This is important - this allows the percentages from Layer 3 to sum to the total percentage of "DISCONTINUED" from Layer 1. ```{r} -adsl2 <- adsl %>% +tplyr_adsl2 <- tplyr_adsl %>% mutate(DISCONTEXT = if_else(DISCONFL == 'Y', 'DISCONTINUED', 'COMPLETED')) -t <- tplyr_table(adsl2, TRT01P, where = SAFFL == 'Y') %>% +t <- tplyr_table(tplyr_adsl2, TRT01P, where = SAFFL == 'Y') %>% add_layer( group_count(DISCONTEXT) ) %>% @@ -218,10 +215,10 @@ The `set_missing_count()` function can take a new `f_str()` object to set the di In the example below 50 random values are removed and NA is specified as the missing string. This leads us to another parameter - `denom_ignore`. By default, if you specify missing values they will still be considered within the denominator, but when you have missing counts, you may wish to exclude them from the totals being summarized. By setting `denom_ignore` to TRUE, your denominators will ignore any groups of missing values that you've specified. 
```{r} -adae2 <- adae -adae2[sample(nrow(adae2), 50), "AESEV"] <- NA +tplyr_adae2 <- tplyr_adae +tplyr_adae2[sample(nrow(tplyr_adae2), 50), "AESEV"] <- NA -t <- tplyr_table(adae2, TRTA) %>% +t <- tplyr_table(tplyr_adae2, TRTA) %>% add_layer( group_count(AESEV) %>% set_format_strings(f_str("xxx (xx.xx%)", n, pct)) %>% @@ -236,6 +233,32 @@ t %>% We did one more other thing worth explaining in the example above - gave the missing count its own sort value. If you leave this field null, it will simply be the maximum value in the order layer plus 1, to put the Missing counts at the bottom during an ascending sort. But tables can be sorted a lot of different ways, as you'll see in the sort vignette. So instead of trying to come up with novel ways for you to control where the missing row goes - we decided to just let you specify your own value. +## Missing Subjects + +Missing counts and counting missing subjects work in two different ways within Tplyr. Missing counts, as described above, will examine the records present in the data and collect any missing values - but for those values to be counted, they first need to be present in the input data itself. On the other hand, missing subjects are calculated by looking at the difference between the potential number of subjects within the column (i.e. the combination of the treatment variables and column variables) and the number of subjects actually present. Consider this example: + +```{r missing_subs1} + missing_subs <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_nest_count(TRUE) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) %>% + set_missing_subjects_row_label("Missing Subjects") + ) %>% + build() + + tail(missing_subs) %>% + select(-starts_with('ord')) %>% + kable() +``` + +In the example above, we produce a nested count layer. The function `add_missing_subjects_row()` triggers the addition of the new result row for which the missing subjects are calculated. The label applied to this row can be configured using `set_missing_subjects_row_label()` and defaults to 'Missing'. Depending on your sorting needs, the `sort_value` can be set to whatever numeric value you provide. Lastly, you can provide an `f_str()` to format the missing subjects row separately from the rest of the layer; otherwise, the format applied to the layer is used. + +Note that in nested count layers, missing subject rows will be generated for each independent group within the outer layer. Outer layers cannot have missing subject rows calculated individually; that calculation is best done in a separate layer, as the result would apply to the whole input target dataset. + ## Adding a 'Total' Row In addition to missing counts, some summaries require the addition of a 'Total' row. **Tplyr** has the helper function `add_total_row()` to ease this process for you. Like most other things within **Tplyr** - particularly in this vignette - this too has a significant bit of nuance to it. @@ -250,10 +273,10 @@ More nuance comes in two places: In the example below, we summarize age groups by sex. The denominators are determined by treatment group and sex, and since we are not excluding any values from the denominator, the total row ends up matching the denominator that was used.
The 'Missing' row tells us the number of missing values, but because `count_missings` is set to `TRUE`, the missing counts are included in the total row. This probably isn't how you would choose to display things, but here we're trying to show the flexibility built into **Tplyr**. ```{r} -adsl2 <- adsl -adsl2[sample(nrow(adsl2), 50), "AGEGR1"] <- NA +tplyr_adsl2 <- tplyr_adsl +tplyr_adsl2[sample(nrow(tplyr_adsl2), 50), "AGEGR1"] <- NA -tplyr_table(adsl2, TRT01P) %>% +tplyr_table(tplyr_adsl2, TRT01P) %>% add_layer( group_count(AGEGR1, by=SEX) %>% set_denoms_by(TRT01P, SEX) %>% # This gives me a Total row for each group @@ -270,7 +293,7 @@ The default text for the Total row is "Total", but we provide `set_total_row_lab Let's look at a more practical version of the table above. If you display missings, you probably want to exclude them from the total. Here we do that using `set_missing_count()`. So more commonly, you'll see this: ```{r} -tplyr_table(adsl2, TRT01P) %>% +tplyr_table(tplyr_adsl2, TRT01P) %>% add_layer( group_count(AGEGR1, by=SEX) %>% set_denoms_by(TRT01P, SEX) %>% # This gives me a Total row for each group diff --git a/vignettes/desc.Rmd b/vignettes/desc.Rmd index 3e7b0a52..542db35e 100644 --- a/vignettes/desc.Rmd +++ b/vignettes/desc.Rmd @@ -21,14 +21,12 @@ library(tidyverse) library(magrittr) library(Tplyr) library(knitr) -load("adlb.Rdata") -load("adsl.Rdata") ``` Descriptive statistics in **Tplyr** are created using the `group_desc()` function when creating a layer. While `group_desc()` allows you to set your target, by variables, and filter criteria, a great deal of the control of the layer comes from `set_format_strings()` where the actual summaries are declared. ```{r intro} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(AGE, by = "Age (years)", where= SAFFL=="Y") %>% set_format_strings( @@ -77,7 +75,9 @@ x %>% ### Notes About Built-in's -Note that the only non-default option being used in any of the function calls above is `na.rm=TRUE`. For most of the functions, this is likely fine - but with IQR, Q1, and Q3 note that there are several different quantile algorithms available in R. The default we chose to use is the R default of Type 7: +Note that the only non-default option being used in any of the function calls above is `na.rm=TRUE`. It's important to note that for `min` and `max`, when `na.rm=TRUE` is used with a vector that is all `NA`, these functions return `Inf` and `-Inf` respectively. When formatting the numbers, this is unexpected and also inconsistent with the other descriptive statistic functions, which return `NA`. Therefore, just for `min` and `max`, `Inf`'s are converted to `NA` so that they'll align with the behavior of the `empty` parameter in `f_str()`. + +Using the default settings of most descriptive statistic functions is typically fine, but with IQR, Q1, and Q3 note that there are several different quantile algorithms available in R. The default we chose to use is the R default of Type 7: $$ m = 1-p. p[k] = (k - 1) / (n - 1). \textrm{In this case, } p[k] = mode[F(x[k])]. \textrm{This is used by S.} $$ @@ -87,7 +87,7 @@ That said, we still want to offer some flexibility here, so you can change the q The example below demonstrates using the default quantile algorithm in R.
```{r quantile_types_default} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(CUMDOSE) %>% set_format_strings("Q1, Q3" = f_str('xxxxx, xxxxx', q1, q3)) @@ -105,7 +105,7 @@ $$ ```{r quantile_types_sas} options(tplyr.quantile_type = 3) -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(CUMDOSE) %>% set_format_strings("Q1, Q3" = f_str('xxxxx, xxxxx', q1, q3)) @@ -137,7 +137,7 @@ As with any other setting in **Tplyr**, the layer setting will always take prece Let's look at an example. ```{r multi-custom} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(vars(AGE, HEIGHTBL), by = "Sepal Length") %>% set_custom_summaries( @@ -163,7 +163,7 @@ Another note about custom summaries is that you're able to overwrite the default For example, here we use the **Tplyr** default mean. ```{r custom_options} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(AGE) %>% set_format_strings("Mean" = f_str('xx.xx', mean)) @@ -181,7 +181,7 @@ options(tplyr.custom_summaries = ) ) -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(AGE) %>% set_format_strings("Mean" = f_str('xx.xx', mean)) diff --git a/vignettes/desc_layer_formatting.Rmd b/vignettes/desc_layer_formatting.Rmd index 0bad68a8..18e9af53 100644 --- a/vignettes/desc_layer_formatting.Rmd +++ b/vignettes/desc_layer_formatting.Rmd @@ -21,9 +21,6 @@ library(tidyverse) library(magrittr) library(Tplyr) library(knitr) -load("adlb.Rdata") -load("adsl.Rdata") -load('adlb.Rdata') ``` A lot of the nuance to formatting descriptive statistics layers was covered in the descriptive statistic layer vignette, but there are a couple more tricks to getting the most out of **Tplyr**. In this vignette, we'll cover some of the options in more detail. @@ -35,14 +32,14 @@ By default, if there is no available value for a summary in a particular observa _Note: **Tplyr** generally respects factor levels - so in instances of a missing row or column group, if the factor level is present, then the variable or row will still generate)_ ```{r missing} -adsl$TRT01P <- as.factor(adsl$TRT01P) -adlb$TRTA <- as.factor(adlb$TRTA) +tplyr_adsl$TRT01P <- as.factor(tplyr_adsl$TRT01P) +tplyr_adlb$TRTA <- as.factor(tplyr_adlb$TRTA) -adlb_2 <- adlb %>% +tplyr_adlb_2 <- tplyr_adlb %>% filter(TRTA != "Placebo") -tplyr_table(adlb_2, TRTA) %>% - set_pop_data(adsl) %>% +tplyr_table(tplyr_adlb_2, TRTA) %>% + set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01P) %>% add_layer( group_desc(AVAL, by=PARAMCD) %>% @@ -57,8 +54,8 @@ tplyr_table(adlb_2, TRTA) %>% Note how the entire example above has all records in `var1_Placebo` missing. **Tplyr** gives you control over how you fill this space. Let's say that we wanted instead to make that space say "Missing". You can control this with the `f_str()` object using the `empty` parameter. ```{r missing1} -tplyr_table(adlb_2, TRTA) %>% - set_pop_data(adsl) %>% +tplyr_table(tplyr_adlb_2, TRTA) %>% + set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01P) %>% add_layer( group_desc(AVAL, by=PARAMCD) %>% @@ -73,8 +70,8 @@ tplyr_table(adlb_2, TRTA) %>% Look at the `empty` parameter above. Here, we use a named character vector, where the name is `.overall`. When this name is used, if all elements within the cell are missing, they will be filled with the specified text. Otherwise, the provided string will fill just the missing parameter. 
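Since the visible hunk above stops before the `f_str()` call being described, here is a rough sketch of the named-vector form of `empty` - the format string and statistics are illustrative assumptions, not code shown in this diff:

```{r empty-overall-sketch, eval=FALSE}
# Sketch only: with a named ".overall" element, the whole cell becomes
# "Missing" only when every statistic in that cell is missing
tplyr_table(tplyr_adlb_2, TRTA) %>%
  set_pop_data(tplyr_adsl) %>%
  set_pop_treat_var(TRT01P) %>%
  add_layer(
    group_desc(AVAL, by = PARAMCD) %>%
      set_format_strings(
        "Mean (SD)" = f_str("xx.x (xx.xx)", mean, sd, empty = c(.overall = "Missing"))
      )
  ) %>%
  build()
```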
In some cases, this may not be what you'd like to see. Perhaps we want a string that fills each missing space. ```{r missing2} -tplyr_table(adlb_2, TRTA) %>% - set_pop_data(adsl) %>% +tplyr_table(tplyr_adlb_2, TRTA) %>% + set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01P) %>% add_layer( group_desc(AVAL, by=PARAMCD) %>% @@ -95,7 +92,7 @@ You may have noticed that the approach to formatting covered so far leaves a lot **Tplyr** has this covered using auto-precision. Auto-precision allows you to format your numeric summaries based on the precision of the data collected. This has all been built into the format strings, because a natural place to specify your desired format is where you specify how you want your data presented. If you wish to use auto-precision, use `a` instead of `x` when creating your summaries. Note that only one `a` is needed on each side of a decimal. To use increased precision, use `a+n` where `n` is the number of additional spaces you wish to add. ```{r autoprecision1} -tplyr_table(adlb, TRTA) %>% +tplyr_table(tplyr_adlb, TRTA) %>% add_layer( group_desc(AVAL, by = PARAMCD) %>% set_format_strings( @@ -113,7 +110,7 @@ As you can see, the decimal precision is now varying depending on the test being But - this is kind of ugly, isn't it? Do we really need all 6 decimal places collected for CA? For this reason, you're able to set a cap on the precision that's displayed: ```{r autoprecision2} -tplyr_table(adlb, TRTA) %>% +tplyr_table(tplyr_adlb, TRTA) %>% add_layer( group_desc(AVAL, by = PARAMCD) %>% set_format_strings( @@ -132,7 +129,7 @@ Now that looks better. The `cap` argument is part of `set_format_strings()`. You This was a basic situation, but if you're paying close attention, you may have some questions. What if you have more by variables, like by visit AND test. Do we then calculate precision by visit and test? What if collected precision is different per visit and we don't want that? What about multiple summary variables? How do we determine precision then? We have modifier functions for this: ```{r precision3} -tplyr_table(adlb, TRTA) %>% +tplyr_table(tplyr_adlb, TRTA) %>% add_layer( group_desc(vars(AVAL, CHG, BASE), by = PARAMCD) %>% set_format_strings( @@ -167,7 +164,7 @@ prec_data <- tibble::tribble( "URATE", 3, 1, ) -tplyr_table(adlb, TRTA) %>% +tplyr_table(tplyr_adlb, TRTA) %>% add_layer( group_desc(AVAL, by = PARAMCD) %>% set_format_strings( @@ -195,7 +192,7 @@ prec_data <- tibble::tribble( "GGT", 3, 0, ) -tplyr_table(adlb, TRTA) %>% +tplyr_table(tplyr_adlb, TRTA) %>% add_layer( group_desc(AVAL, by = PARAMCD) %>% set_format_strings( @@ -216,7 +213,7 @@ tplyr_table(adlb, TRTA) %>% By default, when using 'x' or 'a', any other character within a format string will stay stationary. Consider the standard example from the descriptive statistic layer vignette. ```{r standard} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(AGE, by = "Age (years)", where= SAFFL=="Y") %>% set_format_strings( @@ -235,7 +232,7 @@ tplyr_table(adsl, TRT01P) %>% Note that if a certain number of integers are alotted, space will be left for the numbers that fill that space, but the position of the parenthesis stays fixed. In some displays, you may want the parenthesis to 'hug' your number - the "format group" width should stay fixed, the parenthesis should move to the right along with the numbers consuming less integer space. Within your `f_str()`, you can achieve this by using a capital 'X'. 
For example, focusing on the mean and standard deviation line: ```{r manual_hugging} -tplyr_table(adlb, TRTA, PARAMCD == "CK") %>% +tplyr_table(tplyr_adlb, TRTA, PARAMCD == "CK") %>% add_layer( group_desc(AVAL, by=vars(PARAMCD, AVISIT)) %>% set_format_strings( @@ -251,7 +248,7 @@ tplyr_table(adlb, TRTA, PARAMCD == "CK") %>% Similarly, the same functionality works with auto precision by using a capital A. ```{r auto_hugging} -tplyr_table(adlb, TRTA, PARAMCD == "CK") %>% +tplyr_table(tplyr_adlb, TRTA, PARAMCD == "CK") %>% add_layer( group_desc(AVAL, by=vars(PARAMCD, AVISIT)) %>% set_format_strings( diff --git a/vignettes/general_string_formatting.Rmd b/vignettes/general_string_formatting.Rmd index caa4754b..28bdcdc1 100644 --- a/vignettes/general_string_formatting.Rmd +++ b/vignettes/general_string_formatting.Rmd @@ -20,9 +20,6 @@ knitr::opts_chunk$set( library(Tplyr) library(dplyr) library(tidyr) -load('adsl.Rdata') -load('adlb.Rdata') -load('adae.Rdata') ``` A key focus of producing a clinical table is ensuring that the formatting of the table is in line with the statistician and clinician's expectations. Organizations often have strict standards around this which vary between organizations. Much of this falls outside the scope of **Tplyr**, but **Tplyr** gives _great_ focus to how the numeric results on the page are formatted. R has vast capabilities when it comes to HTML and interactive tables, but **Tplyr's** focus on string formatting is designed for those traditional, PDF document printable pages. The aim is to make it as simple as possible to get what you need when working with a typical monospace font. @@ -34,7 +31,7 @@ _Note: We've still focused on R's interactive capabilities, so be sure to check Regardless of what layer type you use within **Tplyr**, control of formatting is handled by using format strings. Consider the following example. ```{r example_1} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(RACE) %>% set_format_strings( @@ -112,7 +109,7 @@ Note in the format string, the result numbers to be formatted fill the spaces of As detailed in the first example, when using a lower case 'x', the exact width of space allotted by the x's will be preserved. Note the `var1_Placebo` row below. ```{r example_3} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(RACE) %>% set_format_strings( @@ -126,7 +123,7 @@ tplyr_table(adsl, TRT01P) %>% Both the integer width for the `n` counts and the space to the right of the opening parenthesis of the `pct` field are preserved. This guarantees that (when using a monospace font) the non-numeric characters within the format strings will remain in the same place. Given that integers don't truncate, if these spaces are undesired the integer width will automatically increase as needed. In the example below, if the `n` or `pct` result exceeds 10, the width of the output string automatically expands. You can trigger this behavior by using a single 'x' in the integer side of a format group. ```{r example_4} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(RACE) %>% set_format_strings( @@ -143,7 +140,7 @@ The downside of the last example is that alignment between format groups is comp ```{r example_5} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(RACE) %>% set_format_strings( @@ -169,7 +166,7 @@ Lastly, **Tplyr** also has the capability to automatically determine some widths Consider the following example.
```{r example_6} -tplyr_table(adlb, TRTA, where=PARAMCD %in% c("CA", "URATE")) %>% +tplyr_table(tplyr_adlb, TRTA, where=PARAMCD %in% c("CA", "URATE")) %>% add_layer( group_desc(AVAL, by=vars(PARAMCD, AVISIT)) %>% set_format_strings( @@ -186,7 +183,7 @@ Note that the decimal precision varies between different lab test results. This For count layers, auto-precision can also be used surrounding the `n` counts. For example, the default format string for counts layers in **Tplyr** is set as `a (xxx.x%)`. This will auto-format the `n` result based on the maximum summarized value of `n` within the data. For example: ```{r example_7} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(RACE) %>% set_format_strings(f_str("a (xxx.x%)", n, pct)) @@ -200,8 +197,8 @@ Given that the maximum count was >=10 and <100, the integer width for `n` was as For both layer types, a capital `A` follows the same logic as `X`, but is triggered using auto-precision. Take this example of an adverse event table: ```{r example_8} -tplyr_table(adae, TRTA) %>% - set_pop_data(adsl) %>% +tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01A) %>% add_layer( group_count(AEDECOD) %>% diff --git a/vignettes/layer_templates.Rmd b/vignettes/layer_templates.Rmd index 3c1bd5f9..021fc0b6 100644 --- a/vignettes/layer_templates.Rmd +++ b/vignettes/layer_templates.Rmd @@ -17,7 +17,6 @@ knitr::opts_chunk$set( ```{r setup, echo=FALSE} library(Tplyr) library(knitr) -load('adsl.Rdata') ``` There are several scenarios where a layer template may be useful. Some tables, like demographics tables, may have many layers that will all essentially look the same. Categorical variables will have the same count layer settings, and continuous variables will have the same desc layer settings. A template allows a user to build those settings once per layer, then reference the template when the **Tplyr** table is actually built. Another scenario might be building a set of company layer templates that are built for standard tables to reduce the footprint of code across analyses. In either of these cases, the idea is the reduce the amount of redundant code necessary to create a table. @@ -41,7 +40,7 @@ new_layer_template( In this example, we've created a basic layer template. The template is named "example_template", and this is the name we'll use to reference the template when we want to use it. When the template is created, we start with the function `group_count(...)`. Note the use of the ellipsis (i.e. `...`). This is a required part of a layer template. Templates must start with a **Tplyr** layer constructor, which is one of the function `group_count()`, `group_desc()`, or `group_shift()`. The ellipsis is necessary because when the template is used, we are able to pass arguments directly into the layer constructor. For example: ```{r using a template} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( use_template("example_template", RACE, by=ETHNIC) ) %>% @@ -54,7 +53,7 @@ Within `use_template()`, the first parameter is the template name. After that, w An additional feature of layer templates is that they act just as any other function would in a **Tplyr** layer. This means that they're also extensible and can be expanded on directly within a **Tplyr** table. 
For example: ```{r extending a template} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( use_template("example_template", RACE) %>% add_total_row() @@ -83,7 +82,7 @@ In this example, we create a template similar to the first example. But now we a To specify these arguments when using the templater, we use the `use_template()` argument `add_params`. For example: ```{r using params} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( use_template('example_params', RACE, add_params = list( diff --git a/vignettes/metadata.Rmd b/vignettes/metadata.Rmd index 9edf5cf4..abdf4e78 100644 --- a/vignettes/metadata.Rmd +++ b/vignettes/metadata.Rmd @@ -22,7 +22,6 @@ library(tidyr) library(magrittr) library(Tplyr) library(knitr) -load("adsl.Rdata") ``` **Tplyr** has a bit of a unique design, which might feel a bit weird as you get used to the package. The process flow of building a `tplyr_table()` object first, and then using `build()` to construct the data frame is different than programming in the tidyverse, or creating a ggplot. Why create the `tplyr_table()` object first? Why is the `tplyr_table()` object different than the resulting data frame? @@ -34,7 +33,7 @@ The purpose of the `tplyr_table()` object is to let **Tplyr** do more than just Consider the following example: ```{r table_creation} -t <- tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>% +t <- tplyr_table(tplyr_adsl, TRT01P, where = SAFFL == "Y") %>% add_layer( group_count(RACE) ) %>% @@ -100,7 +99,7 @@ The results are provided this was so that they can be unpacked directly into `dp ```{r unpack} m <- get_meta_result(t, 'd1_2', 'var1_Xanomeline High Dose') -adsl %>% +tplyr_adsl %>% filter(!!!m$filters) %>% select(!!!m$names) %>% head(10) %>% @@ -112,7 +111,7 @@ _Note: Trimmed for space_ But - who says you can't let your imagination run wild? ```{r to string print, eval=FALSE} -cat(c("adsl %>%\n", +cat(c("tplyr_adsl %>%\n", " filter(\n ", paste(purrr::map_chr(m$filters, ~ rlang::as_label(.)), collpase=",\n "), ") %>%\n", @@ -120,9 +119,54 @@ cat(c("adsl %>%\n", )) ``` +### Anti Joins + +Most data presented within a table refers back to the target dataset from which data are being summarized. In some cases, data presented may refer to information _excluded_ from the summary. This is the case when you use the **Tplyr** function `add_missing_subjects_row()`. In this case, the counts presented refer to data excluded from the target which are present in the population data. The metadata thus needs to refer to that excluded data. To handle this, there's an additional field called an 'Anti Join'. Consider this example: + +```{r anti_join1} +t <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + set_distinct_by(USUBJID) %>% + add_missing_subjects_row(f_str("xx (XX.x%)", distinct_n, distinct_pct), sort_value = Inf) + ) + +x <- build(t, metadata=TRUE) + +tail(x) %>% + select(starts_with('row'), var1_Placebo) %>% + kable() +``` + +The missing row in this example counts the subjects within their respective treatment groups who do *not* have any adverse events for the body system "SKIN AND SUBCUTANEOUS TISSUE DISORDERS". Here's what the metadata for the result for the Placebo treatment group looks like. + +```{r anti_join2} +m <- get_meta_result(t, 'c23_1', 'var1_Placebo') +m +``` + +This result has the addition field of 'Anti-join'. 
This element has two fields, which are the join metadata, and the "on" field, which specifies a merging variable to be used when "anti-joining" with the target data. The join metadata here refers to the data of interest from the population data. Note that while the metadata for the target data has variable names and filter conditions referring to AEBODSYS and AEDECOD, these variables are _not_ present within the join metadata, because that information is not present within the population data. + +While the usual joins we work with focus on the overlap between two sets, an anti-join looks at the non-overlap. The metadata provided here will specifically give us "The subjects within the Placebo treatment group who do **not** have an adverse event within the body system 'SKIN AND SUBCUTANEOUS TISSUE DISORDERS'". + +Extracting this metadata works very much the same way as extracting other results. + +```{r anti_join3} +head(get_meta_subset(t, 'c23_1', 'var1_Placebo')) +``` + +If you're not working with the `tplyr_table` object, then there's some additional information you need to provide to the function. + +```{r anti_join4} +head(get_meta_subset(t$metadata, 'c23_1', 'var1_Placebo', + target=t$target, pop_data=t$pop_data)) +``` + ``` ```{r to string content, results='asis', echo=FALSE} -cat(c("adsl %>%\n", +cat(c("tplyr_adsl %>%\n", " filter(\n ", paste(purrr::map_chr(m$filters, ~ rlang::as_label(.)), collpase=",\n "), ") %>%\n", @@ -135,7 +179,7 @@ cat(c("adsl %>%\n", So we get get metadata around a result cell, and we can get the exact results from a result cell. You just need a row ID and a column name. But - what does that get you? You can query your tables - and that's great. But how do you _use_ that. -The idea behind this is really to support [Shiny](https://shiny.rstudio.com/). Consider this minimal application. Click any of the result cells within the table and see what happens. +The idea behind this is really to support [Shiny](https://shiny.posit.co/). Consider this minimal application. Click any of the result cells within the table and see what happens. ```{r, out.width=850, out.extra='style="border: 1px solid #464646;" allowfullscreen="" allow="autoplay"', echo=FALSE} diff --git a/vignettes/options.Rmd b/vignettes/options.Rmd index 529c5cd5..e89c0b18 100644 --- a/vignettes/options.Rmd +++ b/vignettes/options.Rmd @@ -21,9 +21,6 @@ library(tidyverse) library(magrittr) library(Tplyr) library(knitr) -load("adae.Rdata") -load("adsl.Rdata") -load("adlb.Rdata") op <- options() ``` @@ -67,7 +64,7 @@ options( Here you can see that **Tplyr** picks up these option changes. In the table below, we didn't use `set_format_strings()` anywhere - instead we let **Tplyr** pick up the default formats from the options. ```{r default_formats2} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(AGE, by = "Age (years)") ) %>% @@ -88,7 +85,7 @@ One important thing to understand about how these options work in particular is To demonstrate, consider the following. The **Tplyr** options remain set from the block above. ```{r scoping1} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% set_count_layer_formats(n_counts = f_str("xx (xxx%)", n, pct)) %>% set_desc_layer_formats("Mean (SD)" = f_str("a.a+1 (a.a+2)", mean, sd)) %>% add_layer( @@ -124,7 +121,7 @@ options(tplyr.precision_cap = c('int'=2, 'dec'=2)) Similar to the layer defaults, setting a precision cap at the layer level will override the `tplyr.precision_cap` option. 
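Because the hunk below truncates before the `cap` argument itself, here is a small sketch of the layer-level form; the cap values are arbitrary and chosen only for illustration (the `c(int = , dec = )` shape matches the option shown above):

```{r precision-cap-sketch, eval=FALSE}
# Sketch only: cap auto-precision at 2 integer and 1 decimal place for this
# layer, overriding the tplyr.precision_cap option set above
tplyr_table(tplyr_adsl, TRT01P) %>%
  add_layer(
    group_desc(HEIGHTBL, by = "Height at Baseline") %>%
      set_format_strings(
        "Mean (SD)" = f_str("a.a (a.a+1)", mean, sd),
        cap = c(int = 2, dec = 1)
      )
  ) %>%
  build()
```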
```{r precision_cap2} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(HEIGHTBL, by = "Height at Baseline") %>% set_format_strings( @@ -166,7 +163,7 @@ Note that the table code used to produce the output is the same. Now **Tplyr** u Now that geometric mean is set within the **Tplyr** options, you can use it within your descriptive statistics layers, just like it was one of the built-in summaries. ```{r custom_summaries2} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(AGE) %>% set_format_strings('Geometric Mean' = f_str('xx.xx', geometric_mean)) @@ -203,7 +200,7 @@ options(op) ```{r scipen2} options(tplyr.scipen = -3) -t <- tplyr_table(adae, TRTA) %>% +t <- tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(AEDECOD) %>% add_risk_diff(c('Xanomeline Low Dose', 'Placebo')) @@ -232,7 +229,7 @@ $$ The example below demonstrates using the default quantile algorithm in R ```{r quantile1} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(CUMDOSE) %>% set_format_strings("Q1, Q3" = f_str('xxxxx, xxxxx', q1, q3)) @@ -251,7 +248,7 @@ $$ ```{r quantile2} options(tplyr.quantile_type = 3) -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_desc(CUMDOSE) %>% set_format_strings("Q1, Q3" = f_str('xxxxx, xxxxx', q1, q3)) diff --git a/vignettes/post_processing.Rmd b/vignettes/post_processing.Rmd index 0578797c..ca676397 100644 --- a/vignettes/post_processing.Rmd +++ b/vignettes/post_processing.Rmd @@ -20,8 +20,6 @@ knitr::opts_chunk$set( library(Tplyr) library(dplyr) library(knitr) -load('adsl.Rdata') -load('adae.Rdata') ``` We've made a large effort to make **Tplyr** tables flexible, but not everything can (or, in some cases, we think should) be handled during table construction itself. To address this, **Tplyr** has several post-processing functions that help put finishing touches on your data to help with presentation. @@ -58,7 +56,7 @@ _Note: We're viewing the data frame output here because HTML based outputs elimi Row masking is the process blanking of repeat row values within a data frame to give the appearance of grouping variables. Some table packages, such as [**gt**](https://gt.rstudio.com/), will handle this for you. Other packages, like [**huxtable**](https://hughjonesd.github.io/huxtable/), have options like merging cells, but this may be a more simplistic approach. Furthermore, this is a common approach in clinical tables when data validation is done on an output dataframe. ```{r row_mask1} -dat <- tplyr_table(adsl, TRT01P) %>% +dat <- tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(RACE, by = "Race n (%)") ) %>% @@ -79,7 +77,7 @@ dat %>% A second feature of `apply_row_masks()` is the ability to apply row breaks between different groups of data, for example, different layers of a table. ```{r row_masks3} -dat <- tplyr_table(adsl, TRT01P) %>% +dat <- tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(RACE, by = "Race n (%)") ) %>% @@ -100,6 +98,82 @@ There are a few considerations when using `apply_row_masks()`: - This function is order dependent, so make sure your data are sorted before submitting to `apply_row_masks()` - When inserting row breaks, by default the Tpylr variable `ord_layer_index` is used. 
You can submit other variables via the ellipsis parameter (`...`) if you'd like to use a different variable grouping to insert rows +## Collapsing Row Labels + +Different table formats call for different handling of row labels, depending on the preferences of an individual organization and the specifics of the table at hand. **Tplyr** inherently creates row labels as separate columns, but similar to the way that count layers nest the inner and the outer layer, we also offer the `collapse_row_labels()` function to pull multiple row labels into a single column. + +```{r collapse_row_labels} +dat <- tplyr_table(tplyr_adsl, TRT01P) %>% + add_layer( + group_count(RACE, by = vars("Race n (%)", SEX)) + ) %>% + add_layer( + group_desc(AGE, by = vars("Age (years)", SEX)) + ) %>% + build() + +collapse_row_labels(dat, row_label1, row_label2, row_label3) %>% + select(row_label, var1_Placebo) +``` +By default, indentation is set to 2 spaces, but by using the `indent` parameter you can change this to any string you desire. + +```{r collapse_row_labels2} +collapse_row_labels(dat, row_label1, row_label2, row_label3, indent = "  ") %>% + select(row_label, var1_Placebo) %>% + kable(escape=FALSE) +``` +You also have control over which columns you collapse, allowing you to keep separate row labels if you don't want all collapsed together + +```{r collapse_row_labels3} +collapse_row_labels(dat, row_label1, row_label2, indent = "  ") %>% + select(row_label, row_label3, var1_Placebo) %>% + head() %>% + kable() +``` + +## Leading Spaces in HTML Files + +Another helper function we've made available is `replace_leading_whitespace()`. In the table created above, note that the `indent` parameter was set using ` `, which is a non-breaking space. This can be used in HTML files to preserve leading white spaces instead of automatically stripping them in the display, as viewing utilities usually do. Ever noticed that in your data viewers you typically don't see leading spaces? Yeah - that's why! + +Let's take the example from above and not change the `indent` parameter. + +```{r replace_leading_whitespace1} +collapse_row_labels(dat, row_label1, row_label2) %>% + select(row_label, row_label3, var1_Placebo) %>% + kable() +``` + +In indented rows, the spaces still exist, and we can see that in the dataframe output itself. + +```{r replace_leading_whitespace2} +collapse_row_labels(dat, row_label1, row_label2) %>% + select(row_label, row_label3, var1_Placebo) %>% + head() +``` + +But the HTML view strips them off when we pass it into the `kable()` function. `replace_leading_whitespace()` will take care of this for us by converting the spaces. Note that you'll see the ` ` in the raw data itself. + +```{r replace_leading_whitespace3} +collapse_row_labels(dat, row_label1, row_label2) %>% + select(row_label, row_label3, var1_Placebo) %>% + mutate( + across(where(is.character), ~ replace_leading_whitespace(.)) + ) %>% + head() +``` + +But now when we want to use this in a display, the ` ` characters will show as leading whitespace within our HTML table. Note that you'll need to prevent escaping special characters for this to work, or the raw text will display. In `kable()` you can use `escape=FALSE` do this. 
+ +```{r replace_leading_whitespace4} +collapse_row_labels(dat, row_label1, row_label2) %>% + select(row_label, row_label3, var1_Placebo) %>% + mutate( + across(where(is.character), ~ replace_leading_whitespace(.)) + ) %>% + head() %>% + kable(escape=FALSE) +``` + ## Conditional Formatting In some circumstances, like `add_total_row()`, **Tplyr** lets you specify special formats separate from those in `set_format_strings()`. But within the table body there's no other way to set specific, conditional formats based on the table data itself. To address this, we've added the post-processing function `apply_conditional_format()` to allow you to set conditional formats on result cells. @@ -177,7 +251,7 @@ In the first call to `str_extract_fmt_group()`, we target the n counts. The firs In practice, `str_extract_fmt_group()` can then be used to separate format groups into their own columns. ```{r fmt_group2} -dat <- tplyr_table(adsl, TRT01P) %>% +dat <- tplyr_table(tplyr_adsl, TRT01P) %>% add_layer( group_count(RACE) ) %>% @@ -204,8 +278,8 @@ In very much the same vein as `str_extract_fmt_group()`, the function `str_extra Consider an adverse event table. In `vignette("sort")` we go over circumstances where you may want to sort by the descending occurrence of a result. We've received questions about how to establish tie breakers in this scenario, where ties should be broken sorting descending occurrence of an adverse event within the high dose group, then the low dose group, and finally the placebo group. **Tplyr** doesn't allow you to output these order variables by default, but getting these numbers is quite simple with `str_extract_num()`. Let's consider a simplified scenario ```{r num1} -dat <- tplyr_table(adae, TRTA) %>% - set_pop_data(adsl) %>% +dat <- tplyr_table(tplyr_adae, TRTA) %>% + set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01A) %>% add_layer( group_count(AEDECOD) %>% diff --git a/vignettes/riskdiff.Rmd b/vignettes/riskdiff.Rmd index 296338da..ba443534 100644 --- a/vignettes/riskdiff.Rmd +++ b/vignettes/riskdiff.Rmd @@ -21,8 +21,6 @@ library(tidyverse) library(magrittr) library(Tplyr) library(knitr) -load("adae.Rdata") -load("adsl.Rdata") ``` **Tplyr** does not support, nor do we intend to support, a wide array of statistical methods. Our goal is rather to take your focus as an analyst off the mundane summaries so you can focus on the interesting analysis. That said, there are some things that are common enough that we feel that it's reasonable for us to include. So let's take a look at risk difference. @@ -34,7 +32,7 @@ Our current implementation of risk difference is solely built on top of the base Risk difference is built on top of count layers, as it's a comparison of proportions. To add a risk difference calculation into a count layer, you simply use the function `add_risk_diff()`. We made a large effort to make this flow very naturally with the count layer construction, so let's walk through it step by step. ```{r riskdiff1} -t <- tplyr_table(adae, TRTA) %>% +t <- tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(AEDECOD) %>% set_distinct_by(USUBJID) %>% @@ -68,7 +66,7 @@ The default values presented within formatted strings in the built table will be You have a good bit of control over these values though, and this can be controlled in the same way you format the count summaries - using `set_format_strings()`. 
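The hunk that follows truncates before the format strings themselves. As a hedged sketch of what that call can look like - the `'riskdiff'` name and the `dif`, `low`, and `high` value names are recalled from the package documentation rather than shown anywhere in this diff:

```{r riskdiff-format-sketch, eval=FALSE}
# Sketch only: 'n_counts' formats the count results and 'riskdiff' formats the
# risk difference estimate with its confidence interval
tplyr_table(tplyr_adae, TRTA) %>%
  add_layer(
    group_count(AEDECOD) %>%
      set_distinct_by(USUBJID) %>%
      add_risk_diff(c('Xanomeline High Dose', 'Placebo')) %>%
      set_format_strings(
        'n_counts' = f_str('xx (xx.x%)', distinct_n, distinct_pct),
        'riskdiff' = f_str('xx.xxx (xx.xxx, xx.xxx)', dif, low, high)
      )
  ) %>%
  build()
```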
```{r riskdiff2} -t <- tplyr_table(adae, TRTA) %>% +t <- tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(AEDECOD) %>% set_distinct_by(USUBJID) %>% @@ -100,7 +98,7 @@ Take a look at the `rdiff` columns now - you'll see they have 5 values. These ar You have the same control over the formatting of the display of these values here as you do with the count summaries. Taking things a step further, you can also pass forward arguments to `stats::prop.test()` using a named list and the `args` argument in `add_risk_diff()`. This wasn't done using the ellipsis (i.e. `...`) like typical R functions because it's already used to capture a varying number of comparisons, but it's not much more difficult to use: ```{r riskdiff3} -t <- tplyr_table(adae, TRTA) %>% +t <- tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(AEDECOD) %>% set_distinct_by(USUBJID) %>% @@ -135,8 +133,8 @@ For more information on these parameters, see the documentation for `stats::prop The default of `add_risk_diff()` works on the distinct counts available within the count summary. ```{r riskdiff4} -t <- tplyr_table(adae, TRTA, where= AEBODSYS == "SKIN AND SUBCUTANEOUS TISSUE DISORDERS") %>% - set_pop_data(adsl) %>% +t <- tplyr_table(tplyr_adae, TRTA, where= AEBODSYS == "SKIN AND SUBCUTANEOUS TISSUE DISORDERS") %>% + set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01A) %>% set_pop_where(TRUE) %>% add_layer( @@ -157,7 +155,7 @@ suppressWarnings(build(t)) %>% If for whatever reason you'd like to run risk difference on the non-distinct counts, switch the `distinct` argument to FALSE. `add_risk_diff()` also will function on multi-level summaries no different than single level, so no concerns there either. ```{r riskdiff5} -t <- tplyr_table(adae, TRTA, where= AEBODSYS == "SKIN AND SUBCUTANEOUS TISSUE DISORDERS") %>% +t <- tplyr_table(tplyr_adae, TRTA, where= AEBODSYS == "SKIN AND SUBCUTANEOUS TISSUE DISORDERS") %>% add_layer( group_count(AEDECOD) %>% set_distinct_by(USUBJID) %>% @@ -178,7 +176,7 @@ suppressWarnings(build(t)) %>% Risk difference also works with the `cols` argument, but it's important to understand how the comparisons work in these situation. Here, it's still the treatment groups that are compared - but the column argument is used as a "by" variable. For example: ```{r riskdiff6} -t <- tplyr_table(adae, TRTA, where= AEBODSYS == "SKIN AND SUBCUTANEOUS TISSUE DISORDERS", cols=SEX) %>% +t <- tplyr_table(tplyr_adae, TRTA, where= AEBODSYS == "SKIN AND SUBCUTANEOUS TISSUE DISORDERS", cols=SEX) %>% add_layer( group_count(AEDECOD) %>% set_distinct_by(USUBJID) %>% @@ -208,7 +206,7 @@ The output of `get_stats_data()` depends on what parameters have been used: This works best when layers are named, as it makes the output much clearer. ```{r riskdiff7} -t <- tplyr_table(adae, TRTA) %>% +t <- tplyr_table(tplyr_adae, TRTA) %>% add_layer(name="PreferredTerm", group_count(AEDECOD) %>% set_distinct_by(USUBJID) %>% diff --git a/vignettes/shift.Rmd b/vignettes/shift.Rmd index 05094358..43a30e8d 100644 --- a/vignettes/shift.Rmd +++ b/vignettes/shift.Rmd @@ -20,9 +20,6 @@ knitr::opts_chunk$set( library(Tplyr) library(dplyr, warn.conflicts = FALSE) library(knitr) -load("adsl.Rdata") -load("adae.Rdata") -load("adlb.Rdata") ``` Shift tables are a special kind of frequency table - but what they count are changes in state. This is most common when looking at laboratory ranges, where you may be interested in seeing how a subject's results related to normal ranges. 
The 'change in state' would refer to how that subject's results were at baseline versus different points of measure. Shift tables allow you to see the distribution of how subjects move between normal ranges, and if the population is improving or worsening as the study progresses. @@ -41,7 +38,7 @@ One thing to note - the `group_shift()` API is intended to be used on shift tabl Let's look at an example. ```{r} -tplyr_table(adlb, TRTA, where=PARAMCD == "CK") %>% +tplyr_table(tplyr_adlb, TRTA, where=PARAMCD == "CK") %>% add_layer( group_shift(vars(row = BNRIND, column = ANRIND), by = vars(PARAM, VISIT)) ) %>% @@ -57,9 +54,9 @@ For the most part, the last example gets us where we want to go - but there's st ## Filling Missing Groups Using Factors ```{r} -adlb$ANRIND <- factor(adlb$ANRIND, levels=c("L", "N", "H")) -adlb$BNRIND <- factor(adlb$BNRIND, levels=c("L", "N", "H")) -tplyr_table(adlb, TRTA, where=PARAMCD == "CK") %>% +tplyr_adlb$ANRIND <- factor(tplyr_adlb$ANRIND, levels=c("L", "N", "H")) +tplyr_adlb$BNRIND <- factor(tplyr_adlb$BNRIND, levels=c("L", "N", "H")) +tplyr_table(tplyr_adlb, TRTA, where=PARAMCD == "CK") %>% add_layer( group_shift(vars(row = BNRIND, column = ANRIND), by = vars(PARAM, VISIT)) ) %>% diff --git a/vignettes/sort.Rmd b/vignettes/sort.Rmd index 57d3a38e..7405a8c7 100644 --- a/vignettes/sort.Rmd +++ b/vignettes/sort.Rmd @@ -20,9 +20,6 @@ knitr::opts_chunk$set( library(Tplyr) library(dplyr, warn.conflicts = FALSE) library(knitr) -load("adsl.Rdata") -load("adae.Rdata") -load("adlb.Rdata") ``` At surface level - sorting a table may seem easy, and in many cases it is. But in a handful of cases it can get quite tricky, with some odd situations that need to be handled carefully. For this reason, we found it necessary to dedicate an entire vignette to just sorting and handling columns output by **Tplyr**. @@ -30,7 +27,7 @@ At surface level - sorting a table may seem easy, and in many cases it is. But i Let's start by looking at an example. ```{r} -t <- tplyr_table(adsl, TRT01A) %>% +t <- tplyr_table(tplyr_adsl, TRT01A) %>% add_total_group() %>% add_treat_grps(Treated = c("Xanomeline Low Dose", "Xanomeline High Dose")) %>% add_layer( @@ -124,8 +121,8 @@ These order variables will calculate based on the first applicable method below. If there's no `VARN` variable in the target dataset, **Tplyr** will then check if the variable you provided is a factor. If you're new to R, spending some time trying to understand factor variables is quite worthwhile. Let's look at example using the variable `ETHNIC` and see some of the advantages in practice. ```{r} -adsl$ETHNIC <- factor(adsl$ETHNIC, levels=c("HISPANIC OR LATINO", "NOT HISPANIC OR LATINO", "DUMMMY")) -tplyr_table(adsl, TRT01A) %>% +tplyr_adsl$ETHNIC <- factor(tplyr_adsl$ETHNIC, levels=c("HISPANIC OR LATINO", "NOT HISPANIC OR LATINO", "DUMMMY")) +tplyr_table(tplyr_adsl, TRT01A) %>% add_layer( group_count(EOSSTT, by = ETHNIC) ) %>% @@ -139,7 +136,7 @@ Factor variables have 'levels'. These levels are essentially what the `VARN` var A highly advantageous aspect of using factor variables in **Tplyr** is that factor variables can be used to insert dummy values into your table. Consider this line of code from above: ``` -adsl$ETHNIC <- factor(adsl$ETHNIC, levels=c("HISPANIC OR LATINO", "NOT HISPANIC OR LATINO", "DUMMMY")) +tplyr_adsl$ETHNIC <- factor(tplyr_adsl$ETHNIC, levels=c("HISPANIC OR LATINO", "NOT HISPANIC OR LATINO", "DUMMMY")) ``` This is converting the variable `ETHNIC` to a factor, then setting the factor levels. 
But it doesn't _change_ any of the values in the dataset - there are no values of "dummy" within `ETHNIC` in ADSL. Yet in the output built above, you see rows for "DUMMY". By using factors, you can insert rows into your **Tplyr** table that don't exist in the data. This is particularly helpful if you're working with data early on in a study, where certain values are expected, yet do not currently exist in the data. This will help you prepare tables that are complete even when your data are not. @@ -149,7 +146,7 @@ This is converting the variable `ETHNIC` to a factor, then setting the factor le To demonstrate the use of `VARN` sorting, consider the variable `RACE.` In `ADSL`, `RACE` also has `RACEN`: ```{r} -adsl %>% +tplyr_adsl %>% distinct(RACEN, RACE) %>% kable() ``` @@ -157,7 +154,7 @@ adsl %>% **Tplyr** will automatically figure this out for you, and pull the `RACEN` values into the variable `ord_layer_1`. ```{r} -tplyr_table(adsl, TRT01A) %>% +tplyr_table(tplyr_adsl, TRT01A) %>% add_layer( group_count(EOSSTT, by = RACE) ) %>% @@ -176,7 +173,7 @@ Lastly, If the target doesn't have a `VARN` variable in the target dataset and i After the `by` variables, each layer will sort results slightly differently. We'll start with the most simple case - descriptive statistic layers. As the user, you have full control over the order in which results present using `set_format_strings()`. Results will be ordered based on the order in which you create your `f_str()` objects. ```{r} -tplyr_table(adsl, TRT01A) %>% +tplyr_table(tplyr_adsl, TRT01A) %>% add_layer( group_desc(HEIGHTBL) %>% set_format_strings( @@ -206,10 +203,10 @@ The order in which results appear on a frequency table can be deceptively comple "byfactor" is the default ordering method of results for count layers. Both "byfactor" and "byvarn" behave exactly like the order variables associated with `by` variables in a **Tplyr** table. For "byvarn", you must set the sort method using `set_order_count_method()`. ```{r} -adsl$AGEGR1 <- factor(adsl$AGEGR1, c("<65", "65-80", ">80")) +tplyr_adsl$AGEGR1 <- factor(tplyr_adsl$AGEGR1, c("<65", "65-80", ">80")) # Warnings suppressed to remove 'forcats' implicit NA warning suppressWarnings({ - tplyr_table(adsl, TRT01A) %>% + tplyr_table(tplyr_adsl, TRT01A) %>% add_layer( group_count(AGEGR1) %>% # This is the default and not needed @@ -222,7 +219,7 @@ suppressWarnings({ ``` ```{r} -tplyr_table(adsl, TRT01A) %>% +tplyr_table(tplyr_adsl, TRT01A) %>% add_layer( group_count(RACE) %>% set_order_count_method("byvarn") @@ -242,7 +239,7 @@ Using count-based sorting is where things get more complicated. There are multip We've created helper functions to aid in making this step more intuitive from a user perspective, and to maintain the flexibility that you need. The two functions that you need here are `set_ordering_cols()` and `set_result_order_var()`. ```{r} -tplyr_table(adae, TRTA) %>% +tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(AEDECOD) %>% # This will present 3 numbers in a cell @@ -270,7 +267,7 @@ The next question that we need to answer when sorting by counts is which result But what if you have an additional column variable on top of the treatment groups? ```{r} -tplyr_table(adae, TRTA, cols=SEX) %>% +tplyr_table(tplyr_adae, TRTA, cols=SEX) %>% add_layer( group_count(AEDECOD) %>% # This will present 3 numbers in a cell @@ -298,7 +295,7 @@ Here we're ordering on the female subjects in the "Xanomeline High Dose" cohort. Nested count layers add one more piece to the puzzle. 
As a reminder, nested count layers are count summaries that are summarizing both a grouping variable, and a variable that's being grouped. The best example is probably Adverse Event tables, where we want to see adverse events that occurred within different body systems. ```{r} -tplyr_table(adae, TRTA) %>% +tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(vars(AEBODSYS, AEDECOD)) ) %>% @@ -313,7 +310,7 @@ In a layer that uses nesting, we need one more order variable - as we're now con These result variables will always be the last two order variables output by **Tplyr**. In the above example, `ord_layer_1` is for `AEBODSYS` and `ord_layer_2` is for `AEDECOD`. Note that `ord_layer_2` has `Inf` where `row_label1` and `row_label2` are both equal. This is the row that summarizes the `AEBODSYS` counts. By default, **Tplyr** is set to assume that you will use **descending** sort on the order variable associated with the inside count variable (i.e. `AEDECOD`). This is because in nested count layer you will often want to sort by descending occurrence of the inside target variable. If you'd like to use ascending sorting instead, we offer the function `set_outer_sort_position()`. ```{r} -tplyr_table(adae, TRTA) %>% +tplyr_table(tplyr_adae, TRTA) %>% add_layer( group_count(vars(AEBODSYS, AEDECOD)) %>% set_outer_sort_position("asc") @@ -330,7 +327,7 @@ Notice that the `Inf` has now switched to `-Inf` to ensure that the `AEBODSYS` r Another consideration of nested sorting is whether or not you want to sort both result variables the same way. Do you want to sort both by counts? Or do you want to sort one alphabetically and the other by count? Or maybe one has a `VARN` variable associated with it? For this reason, `set_order_count_method()` can take in a 2-element character vector, where the first element specifies the outside variable and the second the inside variable. ```{r} -tplyr_table(adsl, TRT01A) %>% +tplyr_table(tplyr_adsl, TRT01A) %>% add_layer( group_count(vars(EOSSTT, DCDECOD)) %>% set_order_count_method(c("byfactor", "bycount")) @@ -345,7 +342,7 @@ In the example above, `EOSTT` is ordered alphabetically (recall that using "byfa If only one method is provided, that method will automatically be applied to both variables. So in the example below, "bycount" is applied to both `EOSTT` and `DSDECOD`. ```{r} -tplyr_table(adsl, TRT01A) %>% +tplyr_table(tplyr_adsl, TRT01A) %>% add_total_group() %>% add_layer( group_count(vars(EOSSTT, DCDECOD)) %>% @@ -364,7 +361,7 @@ tplyr_table(adsl, TRT01A) %>% Shift tables keep things relatively simple when it comes to sorting and use the "byfactor" method seen above. We encourage this primarily because you likely want the benefits of factor variables on a shift layer. 
For example, consider this table: ```{r} -tplyr_table(adlb, TRTA, where=PARAMCD == "CK") %>% +tplyr_table(tplyr_adlb, TRTA, where=PARAMCD == "CK") %>% add_layer( group_shift(vars(row = BNRIND, column = ANRIND), by = vars(PARAM, AVISIT)) ) %>% @@ -382,10 +379,10 @@ There are a few problems here: Using factor variables cleans this right up for us: ```{r} -adlb$BNRIND <- factor(adlb$BNRIND, levels=c("L", "N", "H")) -adlb$ANRIND <- factor(adlb$ANRIND, levels=c("L", "N", "H")) +tplyr_adlb$BNRIND <- factor(tplyr_adlb$BNRIND, levels=c("L", "N", "H")) +tplyr_adlb$ANRIND <- factor(tplyr_adlb$ANRIND, levels=c("L", "N", "H")) -tplyr_table(adlb, TRTA, where=PARAMCD == "CK") %>% +tplyr_table(tplyr_adlb, TRTA, where=PARAMCD == "CK") %>% add_layer( group_shift(vars(row = BNRIND, column = ANRIND), by = vars(PARAM, AVISIT)) ) %>% diff --git a/vignettes/styled-table.Rmd b/vignettes/styled-table.Rmd index 31cb89c5..3d1a73df 100644 --- a/vignettes/styled-table.Rmd +++ b/vignettes/styled-table.Rmd @@ -21,7 +21,6 @@ library(Tplyr) library(dplyr, warn.conflicts = FALSE) library(knitr) library(kableExtra) -load("adsl.Rdata") ``` In the other vignettes we talk about how to get the most out of **Tplyr** when it comes to preparing your data. The last step we need to cover is how to get from the data output by **Tplyr** to a presentation ready table. @@ -43,14 +42,14 @@ Let's build a demographics table to see how this all works. ## Preparing the data ```{r demog_table} -adsl <- adsl %>% +tplyr_adsl <- tplyr_adsl %>% mutate( SEX = recode(SEX, M = "Male", F = "Female"), RACE = factor(RACE, c("AMERICAN INDIAN OR ALASKA NATIVE", "ASIAN", "BLACK OR AFRICAN AMERICAN", "NATIVE HAWAIIN OR OTHER PACIFIC ISLANDER", "WHITE", "MULTIPLE")) ) -t <- tplyr_table(adsl, TRT01P) %>% +t <- tplyr_table(tplyr_adsl, TRT01P) %>% add_total_group() %>% add_layer(name = 'Sex', group_count(SEX, by = "Sex n (%)") %>% diff --git a/vignettes/table.Rmd b/vignettes/table.Rmd index 0d4ab526..bad21d72 100644 --- a/vignettes/table.Rmd +++ b/vignettes/table.Rmd @@ -21,8 +21,6 @@ library(tidyverse) library(magrittr) library(Tplyr) library(knitr) -load("adae.Rdata") -load("adsl.Rdata") ``` Most of the work in creating a **Tplyr** table is at the layer level, but there are a few overarching properties that are worth spending some time discussing. One of the things that we wanted to make sure we did in **Tplyr** is allow you to eliminate redundant code wherever possible. Adding some processing to the `tplyr_table()` level allows us to do that. Furthermore, some settings simply need to be applied table wide. @@ -39,7 +37,7 @@ The `tplyr_table()` function has 4 parameters: Let's look at an example: ```{r table_params1} -tplyr_table(adsl, TRT01P, where= SAFFL =="Y", cols = SEX) %>% +tplyr_table(tplyr_adsl, TRT01P, where= SAFFL =="Y", cols = SEX) %>% add_layer( group_count(RACE, by = "Race") ) %>% @@ -55,7 +53,7 @@ In the example above, the `where` parameter is passed forward into both the `RAC _Note: Treatment groups and additional column variables presented in the final output are always taken from the **pre-filtered** population data. 
This means that if a filter completely excludes a treatment group or group within a column variable, columns will still be created for those groups and will be empty/zero filled._ ```{r table_params2} -tplyr_table(adsl, TRT01P, where= SAFFL =="Y", cols = vars(SEX, RACE)) %>% +tplyr_table(tplyr_adsl, TRT01P, where= SAFFL =="Y", cols = vars(SEX, RACE)) %>% add_layer( group_desc(AGE, by = "Age (Years)") ) %>% @@ -73,7 +71,7 @@ Another important feature that works at the table level is the addition of treat We've added the function `add_treat_grps()` to do this work for you. With this function, you can create new treatment groups by combining existing treatment groups from values within `treat_var`. Additionally, to simplify the process we added an abstraction of `add_treat_grps()` named `add_total_group()` to simplify the process of creating a "Total" group. ```{r treat_grps} -tplyr_table(adsl, TRT01P) %>% +tplyr_table(tplyr_adsl, TRT01P) %>% add_treat_grps('Treated' = c("Xanomeline High Dose", "Xanomeline Low Dose")) %>% add_total_group() %>% add_layer( @@ -92,8 +90,8 @@ A last and very important aspect of table level properties in **Tplyr** is the a **Tplyr** allows you to provide a separate population dataset to overcome this. Furthermore, you are also able to provide a separate `where` parameter for the population dataset and a population treatment variable named `pop_treat_var`, as variable names may differ between the datasets. ```{r pop_data1} -t <- tplyr_table(adae, TRTA, where = AEREL != "NONE") %>% - set_pop_data(adsl) %>% +t <- tplyr_table(tplyr_adae, TRTA, where = AEREL != "NONE") %>% + set_pop_data(tplyr_adsl) %>% set_pop_treat_var(TRT01A) %>% set_pop_where(TRUE) %>% add_layer( @@ -115,6 +113,60 @@ header_n(t) %>% kable() ``` +Note: it’s expected that the `set_distinct_by()` function is used with population data. This is because it does not make sense to use population data denominators unless you have distinct counts. The entire point of population data is to use subject counts, so non-distinct counts would potentially count multiple records per subject and then the percentage doesn’t make any sense. + +## Data Completion + +When creating summary tables, often we have to mock up the potential values of data, even if those values aren't present in the data we're summarizing. **Tplyr** makes its best effort to do this for you. Let's consider the following dataset: + +```{r data_comp, echo=FALSE} +kable(head(tplyr_adpe)) +``` +Let's say we want to create a count summary for this dataset, and report it by PARAM and AVISIT. Note that in the data, `PARAM=="HEAD"` is only collected at screening, while `LUNGS` is collected at Screening, Day -1, and Day 5. + +```{r data_comp1} +tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PARAM, AVISIT)) + ) %>% + build() %>% + select(-starts_with('ord')) %>% + head(18) %>% + kable() + +``` + +By default, given the `by` variables of PARAM and AVISIT, all of the potential visits have dummy rows created that are 0 filled - meaning results of 0 records for all treatment groups are presented. However, that might not be what you wish to present. Perhaps `HEAD` was only intended to be collected at the Screening visit, so it's unnecessary to present other visits. To address this, you can use the `set_limit_data_by()` function.
+ +```{r data_comp2} +tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT) + ) %>% + build() %>% + select(-starts_with('ord')) %>% + head(12) %>% + kable() +``` + +Here you can see that records for `HEAD` now only present the Screening visit. For count and shift layers, you can additionally dig further in and use target variables: + +```{r data_comp3} +tplyr_table(tplyr_adpe, TRT01A) %>% + add_layer( + group_count(AVALC, by = vars(PARAM, AVISIT)) %>% + set_limit_data_by(PARAM, AVISIT, AVALC) + ) %>% + build() %>% + select(-starts_with('ord')) %>% + kable() +``` + +This effectively limits the summary to the values present in the data itself. + +## Where to Go From Here + With the table level settings under control, now you're ready to learn more about what **Tplyr** has to offer in each layer. - Learn more about descriptive statistics layers in `vignette("desc")`