Skip to content

Commit

Permalink
Merge pull request #139 from atorus-research/gh_issues_136_138
Browse files Browse the repository at this point in the history
Fix Issues #136 and #138
  • Loading branch information
mstackhouse authored Nov 7, 2023
2 parents e037427 + c31a8e2 commit 96911c4
Show file tree
Hide file tree
Showing 19 changed files with 718 additions and 351 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Imports:
tidyselect (>= 1.1.0),
tibble (>= 3.0.1),
lifecycle,
forcats (>= 0.4.0)
forcats (>= 1.0.0)
Suggests:
testthat (>= 2.1.0),
haven (>= 2.2.0),
Expand Down
5 changes: 4 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ importFrom(dplyr,between)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
importFrom(dplyr,case_when)
importFrom(dplyr,cur_column)
importFrom(dplyr,cur_group)
importFrom(dplyr,distinct)
importFrom(dplyr,do)
Expand All @@ -146,6 +147,7 @@ importFrom(dplyr,group_keys)
importFrom(dplyr,if_else)
importFrom(dplyr,lag)
importFrom(dplyr,left_join)
importFrom(dplyr,matches)
importFrom(dplyr,mutate)
importFrom(dplyr,mutate_all)
importFrom(dplyr,mutate_at)
Expand All @@ -164,7 +166,7 @@ importFrom(dplyr,vars)
importFrom(forcats,fct_collapse)
importFrom(forcats,fct_drop)
importFrom(forcats,fct_expand)
importFrom(forcats,fct_explicit_na)
importFrom(forcats,fct_na_value_to_level)
importFrom(lifecycle,deprecate_soft)
importFrom(lifecycle,deprecate_stop)
importFrom(magrittr,"%>%")
Expand Down Expand Up @@ -257,6 +259,7 @@ importFrom(tibble,add_column)
importFrom(tibble,rownames_to_column)
importFrom(tibble,tibble)
importFrom(tidyr,complete)
importFrom(tidyr,fill)
importFrom(tidyr,nesting)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
Expand Down
56 changes: 56 additions & 0 deletions R/call_standardise.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# # MIT License
#
# Copyright (c) 2020 rlang authors
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

#' rlang deprecated call_standardise, but this is necessary for
#' the `modify_nested_call()` function. The capability isn't
#' matched by the `call_match()` function.
#'
#' In modify nested call, this is used to restructure the function call
#' to have all named arguments. Since all the functions called inside
#' `call_standardise()` are exported and currently stable within rlang,
#' I'm porting the function here.
#'
#' @param call
#'
#' @return call
#' @noRd
tplyr_call_standardise <- function(call, env= rlang::caller_env()) {

expr <- rlang::get_expr(call)

if (!rlang::is_call(expr)) {
stop("call_standardise error")
}

# The call name might be a literal, not necessarily a symbol
env <- rlang::get_env(call, env)
fn <- rlang::eval_bare(rlang::node_car(expr), env)

if (rlang::is_primitive(fn)) {
call
} else {
matched <- match.call(fn, expr)
rlang::set_expr(call, matched)
}

}

159 changes: 105 additions & 54 deletions R/count.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ process_summaries.count_layer <- function(x, ...) {

## Drop levels if target var is factor and kept levels used
if (eval_tidy(keep_levels_logic) &&
is.factor(built_target[[as_name(tail(target_var, 1)[[1]])]])) {
is.factor(built_target[[as_name(tail(target_var, 1)[[1]])]])) {
# Pull out the levels that weren't in keep levels.
target_levels <- levels(built_target[[as_name(tail(target_var, 1)[[1]])]])
drop_levels_ind <- !(target_levels %in% levels_to_keep)
Expand All @@ -72,7 +72,7 @@ process_summaries.count_layer <- function(x, ...) {
}

if (length(target_var) == 2 && !quo_is_symbol(target_var[[2]]) &&
as_name(target_var[[2]]) %in% names(target)) {
as_name(target_var[[2]]) %in% names(target)) {
warning(paste0("The second target variable has been coerced into a symbol.",
"You should pass variable names unquoted."), immediate. = TRUE)

Expand Down Expand Up @@ -139,27 +139,58 @@ process_single_count_target <- function(x) {
# Used to temporarily check formats
if (is.null(format_strings)) tmp_fmt <- gather_defaults.count_layer(current_env())
if (count_missings && !(is.null(denom_ignore) || length(denom_ignore) == 0) &&
(("pct" %in% total_count_format$vars || "distinct_pct" %in% total_count_format$vars) ||
# Logic if no total_count format
(is.null(total_count_format) && is.null(format_strings) && ("pct" %in% tmp_fmt$n_counts$vars || "distinct_pct" %in% tmp_fmt$n_counts$vars)) ||
(is.null(total_count_format) && ("pct" %in% count_fmt$n_counts$vars || "distinct_pct" %in% count_fmt$n_counts$vars))
)
) {
warning("Your total row is ignoring certain values. The 'pct' in this row may not be 100%",
immediate. = TRUE)
}
(("pct" %in% total_count_format$vars || "distinct_pct" %in% total_count_format$vars) ||
# Logic if no total_count format
(is.null(total_count_format) && is.null(format_strings) && ("pct" %in% tmp_fmt$n_counts$vars || "distinct_pct" %in% tmp_fmt$n_counts$vars)) ||
(is.null(total_count_format) && ("pct" %in% count_fmt$n_counts$vars || "distinct_pct" %in% count_fmt$n_counts$vars))
)
) {
warning("Your total row is ignoring certain values. The 'pct' in this row may not be 100%",
immediate. = TRUE)
}
}

if (is.null(count_row_prefix)) count_row_prefix <- ""

if (is.null(denoms_by)) denoms_by <- c(treat_var, cols)
# If a denoms variable is factor then it should be character for the denoms calculations
denoms_df_prep <- denoms_df %>%
mutate(
across(dplyr::where(is.factor), ~as.character(.))
)

# Nested counts might have summary var come through as numeric
if ('summary_var' %in% map_chr(denoms_by, as_name) && is.numeric(denoms_df_prep$summary_var)) {
denoms_df_prep$summary_var <- as.character(denoms_df_prep$summary_var)
}

# But if a summary_stat variable is factor, then the denoms needs to match this
# This happens if sorting was triggered for the variable as a factor
# fct_cols will be a named logical vector of the variable names, where TRUE
# is the summary_stat variables that are factors
fct_cols <- map_lgl(summary_stat, is.factor)

if (any(fct_cols)) {
# From the bool vector of fct_cols, grab the names of the ones that
# are TRUE
# Create a regular expression like var1|var2|var3
fct_cols_ns <- paste(names(fct_cols[fct_cols]), collapse="|")

# Reset each factor variable to have the appropriate levels for the denom
# so that 0 filling can happen appropriately later on
denoms_df_prep <- denoms_df_prep %>%
mutate(
across(matches(fct_cols_ns), ~ factor(., levels=levels(summary_stat[[cur_column()]])))
)

rm(fct_cols_ns)
}

# rbind tables together
numeric_data <- summary_stat %>%
bind_rows(total_stat) %>%
rename("summary_var" = !!target_var[[1]]) %>%
group_by(!!!denoms_by) %>%
do(get_denom_total(., denoms_by, denoms_df, "n")) %>%
do(get_denom_total(., denoms_by, denoms_df_prep, "n")) %>%
mutate(summary_var = prefix_count_row(summary_var, count_row_prefix)) %>%
ungroup()

Expand All @@ -175,6 +206,8 @@ process_single_count_target <- function(x) {
distinct_stat[, c("distinct_n", "distinct_total")])
}

rm(denoms_df_prep, fct_cols)

}, envir = x)
}

Expand Down Expand Up @@ -214,8 +247,8 @@ process_count_n <- function(x) {
# If there is a missing_count_string, but its not in the dataset
if (!is.null(missing_count_string) &&

!((any(unname(unlist(missing_count_list)) %in% unique(built_target[, as_name(target_var[[1]])]))) ||
any(is.na(built_target[, as_name(target_var[[1]])])))) {
!((any(unname(unlist(missing_count_list)) %in% unique(built_target[, as_name(target_var[[1]])]))) ||
any(is.na(built_target[, as_name(target_var[[1]])])))) {
# This adds the missing string as a factor to the tallies. This is needed
# to make sure the missing row is added even if there are no missing values.
summary_stat <- summary_stat %>%
Expand Down Expand Up @@ -318,12 +351,12 @@ prepare_format_metadata.count_layer <- function(x) {
# distinct_by
# If both distinct and n
if (((("distinct_n" %in% map(format_strings$n_counts$vars, as_name) &
"n" %in% map(format_strings$n_counts$vars, as_name)) |
# or both distinct_pct and pct
("distinct_pct" %in% map(format_strings$n_counts$vars, as_name) &
"pct" %in% map(format_strings$n_counts$vars, as_name))) &
# AND distinct_by is null
is.null(distinct_by))) {
"n" %in% map(format_strings$n_counts$vars, as_name)) |
# or both distinct_pct and pct
("distinct_pct" %in% map(format_strings$n_counts$vars, as_name) &
"pct" %in% map(format_strings$n_counts$vars, as_name))) &
# AND distinct_by is null
is.null(distinct_by))) {
stop("You can't use distinct and non-distinct parameters without specifying a distinct_by")
}

Expand Down Expand Up @@ -389,11 +422,11 @@ process_formatting.count_layer <- function(x, ...) {
has_missing_count = has_missing_count)
}) %>%
# Pivot table
pivot_wider(id_cols = c(match_exact(by), "summary_var"),
pivot_wider(id_cols = c(match_exact(by), "summary_var"),
names_from = c(!!treat_var, match_exact(cols)), values_from = n,
names_prefix = "var1_") %>%
# Replace the by variables and target variable names with `row_label<n>`
replace_by_string_names(quos(!!!by, summary_var))
# Replace the by variables and target variable names with `row_label<n>`
replace_by_string_names(quos(!!!by, summary_var))

if (is_built_nest) {
# I had trouble doing this in a 'tidy' way so I just did it here.
Expand Down Expand Up @@ -491,7 +524,7 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot
# Iterate over every variable
for (i in seq_along(vars_ord)) {
str_all[[i + 1]] <- count_string_switch_help(vars_ord[i], count_fmt, .n[!missing_rows & !total_rows], .total[!missing_rows & !total_rows],
.distinct_n[!missing_rows & !total_rows], .distinct_total[!missing_rows & !total_rows], vars_ord)
.distinct_n[!missing_rows & !total_rows], .distinct_total[!missing_rows & !total_rows], vars_ord)
}


Expand All @@ -501,12 +534,12 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot
missing_str_all[1] <- missing_f_str$repl_str
for (i in seq_along(missing_vars_ord)) {
missing_str_all[[i + 1]] <- count_string_switch_help(missing_vars_ord[i],
missing_f_str,
.n[missing_rows],
.total[missing_rows],
.distinct_n[missing_rows],
.distinct_total[missing_rows],
missing_vars_ord)
missing_f_str,
.n[missing_rows],
.total[missing_rows],
.distinct_n[missing_rows],
.distinct_total[missing_rows],
missing_vars_ord)
}

total_str_all <- vector("list", 5)
Expand All @@ -516,9 +549,9 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot
total_count_format,
.n[total_rows],
.total[total_rows],
.distinct_n[total_rows],
.distinct_total[total_rows],
total_vars_ord)
.distinct_n[total_rows],
.distinct_total[total_rows],
total_vars_ord)
}

# Put the vector strings together. Only include parts of str_all that aren't null
Expand Down Expand Up @@ -658,8 +691,8 @@ process_count_denoms <- function(x) {
"` is invalid. Filter error:\n", e))
})

# For distinct counts, we want to defer back to the
# population dataset. Trigger this by identifying that
# For distinct counts, we want to defer back to the
# population dataset. Trigger this by identifying that
# the population dataset was overridden
if (!isTRUE(try(identical(pop_data, target)))) {
# If the denom_where doesn't match the where AND the denom_where isn't true
Expand All @@ -675,30 +708,48 @@ process_count_denoms <- function(x) {
group_by(!!!layer_params[param_apears]) %>%
summarize(
n = n()
) %>%
) %>%
ungroup()

# If user specified treatment var as a denom by then remove it
# and if inside a nested layer, rename summary_var in the denoms_by
# for building this table
if (is.null(denoms_by)) denoms_by <- c(treat_var, cols)
dist_grp <- denoms_by
which_is_treatvar <- which(
map_lgl(denoms_by, ~ as_name(.) %in% c(as_name(pop_treat_var), as_name(treat_var)))
)
if (length(which_is_treatvar) > 0) {
dist_grp <- dist_grp[-which_is_treatvar]
}
is_svar <- map_lgl(dist_grp, ~as_name(.) == "summary_var")
if (any(is_svar)) {
dist_grp[[which(is_svar)]] <- layer_params[[1]]
}

denoms_df_dist <- built_pop_data %>%
filter(!!denom_where) %>%
group_by(!!pop_treat_var) %>%
group_by(!!pop_treat_var, !!!dist_grp) %>%
summarize(
distinct_n = n_distinct(!!!distinct_by, !!pop_treat_var)
) %>%
ungroup()

by_join <- as_name(pop_treat_var)
names(by_join) <- as_name(treat_var)
# Create merge variables to get the denoms dataframe merged correctly
by_join <- map_chr(append(dist_grp, pop_treat_var, after=0), as_name)
names(by_join) <- map_chr(append(dist_grp, treat_var, after=0), as_name)


denoms_df <- denoms_df_n %>%
complete(!!!layer_params[param_apears],
fill = list(n = 0)) %>%
left_join(denoms_df_dist, by = by_join)

if (as_name(target_var[[1]]) %in% names(target)) {
denoms_df <- denoms_df %>%
rename("summary_var" := !!target_var[[1]])
}

rm(by_join, denoms_df_n, denoms_df_dist, dist_grp, is_svar, which_is_treatvar)

}, envir = x)

}
Expand Down Expand Up @@ -732,7 +783,7 @@ rename_missing_values <- function(x) {
# Replace the implicit values in built_target
built_target <- built_target %>%
mutate(!!target_var[[1]] := fct_expand(!!target_var[[1]], "(Missing)")) %>%
mutate(!!target_var[[1]] := fct_explicit_na(!!target_var[[1]]))
mutate(!!target_var[[1]] := fct_na_value_to_level(!!target_var[[1]], level="(Missing)"))

}
built_target <- built_target %>%
Expand All @@ -757,17 +808,17 @@ filter_numeric <- function(.data,
return(.data)
}

vals <- .data %>%
{if (is.null(numeric_cutoff_column)) . else filter(., !!treat_var == numeric_cutoff_column)} %>%
mutate(
pct = n/total,
distinct_pct = distinct_n/distinct_total
) %>%
filter(!!sym(numeric_cutoff_stat) >= !!numeric_cutoff) %>%
extract2("summary_var")

.data %>%
filter(summary_var %in% vals)
vals <- .data %>%
{if (is.null(numeric_cutoff_column)) . else filter(., !!treat_var == numeric_cutoff_column)} %>%
mutate(
pct = n/total,
distinct_pct = distinct_n/distinct_total
) %>%
filter(!!sym(numeric_cutoff_stat) >= !!numeric_cutoff) %>%
extract2("summary_var")

.data %>%
filter(summary_var %in% vals)


}
Expand Down
Loading

0 comments on commit 96911c4

Please sign in to comment.