Merge pull request #139 from atorus-research/gh_issues_136_138

Fix Issues #136 and #138
atorus-research · Nov 7, 2023 · 96911c4 · 96911c4
2 parents e037427 + c31a8e2
commit 96911c4
Show file tree

Hide file tree

Showing 19 changed files with 718 additions and 351 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -46,7 +46,7 @@ Imports:
     tidyselect (>= 1.1.0),
     tibble (>= 3.0.1),
     lifecycle,
-    forcats (>= 0.4.0)
+    forcats (>= 1.0.0)
 Suggests: 
     testthat (>= 2.1.0),
     haven (>= 2.2.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -135,6 +135,7 @@ importFrom(dplyr,between)
 importFrom(dplyr,bind_cols)
 importFrom(dplyr,bind_rows)
 importFrom(dplyr,case_when)
+importFrom(dplyr,cur_column)
 importFrom(dplyr,cur_group)
 importFrom(dplyr,distinct)
 importFrom(dplyr,do)
@@ -146,6 +147,7 @@ importFrom(dplyr,group_keys)
 importFrom(dplyr,if_else)
 importFrom(dplyr,lag)
 importFrom(dplyr,left_join)
+importFrom(dplyr,matches)
 importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_all)
 importFrom(dplyr,mutate_at)
@@ -164,7 +166,7 @@ importFrom(dplyr,vars)
 importFrom(forcats,fct_collapse)
 importFrom(forcats,fct_drop)
 importFrom(forcats,fct_expand)
-importFrom(forcats,fct_explicit_na)
+importFrom(forcats,fct_na_value_to_level)
 importFrom(lifecycle,deprecate_soft)
 importFrom(lifecycle,deprecate_stop)
 importFrom(magrittr,"%>%")
@@ -257,6 +259,7 @@ importFrom(tibble,add_column)
 importFrom(tibble,rownames_to_column)
 importFrom(tibble,tibble)
 importFrom(tidyr,complete)
+importFrom(tidyr,fill)
 importFrom(tidyr,nesting)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)

diff --git a/R/call_standardise.R b/R/call_standardise.R
@@ -0,0 +1,56 @@
+# # MIT License
+#
+# Copyright (c) 2020 rlang authors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+#' rlang deprecated call_standardise, but this is necessary for
+#' the `modify_nested_call()` function. The capability isn't
+#' matched by the `call_match()` function.
+#'
+#' In modify nested call, this is used to restructure the function call
+#' to have all named arguments. Since all the functions called inside
+#' `call_standardise()` are exported and currently stable within rlang,
+#' I'm porting the function here.
+#'
+#' @param call
+#'
+#' @return call
+#' @noRd
+tplyr_call_standardise <- function(call, env= rlang::caller_env()) {
+
+  expr <- rlang::get_expr(call)
+
+  if (!rlang::is_call(expr)) {
+    stop("call_standardise error")
+  }
+
+  # The call name might be a literal, not necessarily a symbol
+  env <- rlang::get_env(call, env)
+  fn <- rlang::eval_bare(rlang::node_car(expr), env)
+
+  if (rlang::is_primitive(fn)) {
+    call
+  } else {
+    matched <- match.call(fn, expr)
+    rlang::set_expr(call, matched)
+  }
+
+}
+
diff --git a/R/count.R b/R/count.R
@@ -48,7 +48,7 @@ process_summaries.count_layer <- function(x, ...) {
 
       ## Drop levels if target var is factor and kept levels used
       if (eval_tidy(keep_levels_logic) &&
-         is.factor(built_target[[as_name(tail(target_var, 1)[[1]])]])) {
+          is.factor(built_target[[as_name(tail(target_var, 1)[[1]])]])) {
         # Pull out the levels that weren't in keep levels.
         target_levels <- levels(built_target[[as_name(tail(target_var, 1)[[1]])]])
         drop_levels_ind <- !(target_levels %in% levels_to_keep)
@@ -72,7 +72,7 @@ process_summaries.count_layer <- function(x, ...) {
     }
 
     if (length(target_var) == 2 && !quo_is_symbol(target_var[[2]]) &&
-                as_name(target_var[[2]]) %in% names(target)) {
+        as_name(target_var[[2]]) %in% names(target)) {
       warning(paste0("The second target variable has been coerced into a symbol.",
                      "You should pass variable names unquoted."), immediate. = TRUE)
 
@@ -139,27 +139,58 @@ process_single_count_target <- function(x) {
       # Used to temporarily check formats
       if (is.null(format_strings)) tmp_fmt <- gather_defaults.count_layer(current_env())
       if (count_missings && !(is.null(denom_ignore) || length(denom_ignore) == 0) &&
-         (("pct" %in% total_count_format$vars || "distinct_pct" %in% total_count_format$vars) ||
-         # Logic if no total_count format
-         (is.null(total_count_format) && is.null(format_strings) && ("pct" %in% tmp_fmt$n_counts$vars || "distinct_pct" %in% tmp_fmt$n_counts$vars)) ||
-         (is.null(total_count_format) && ("pct" %in% count_fmt$n_counts$vars || "distinct_pct" %in% count_fmt$n_counts$vars))
-         )
-         ) {
-           warning("Your total row is ignoring certain values. The 'pct' in this row may not be 100%",
-                   immediate. = TRUE)
-         }
+          (("pct" %in% total_count_format$vars || "distinct_pct" %in% total_count_format$vars) ||
+           # Logic if no total_count format
+           (is.null(total_count_format) && is.null(format_strings) && ("pct" %in% tmp_fmt$n_counts$vars || "distinct_pct" %in% tmp_fmt$n_counts$vars)) ||
+           (is.null(total_count_format) && ("pct" %in% count_fmt$n_counts$vars || "distinct_pct" %in% count_fmt$n_counts$vars))
+          )
+      ) {
+        warning("Your total row is ignoring certain values. The 'pct' in this row may not be 100%",
+                immediate. = TRUE)
+      }
     }
 
     if (is.null(count_row_prefix)) count_row_prefix <- ""
 
-    if (is.null(denoms_by)) denoms_by <- c(treat_var, cols)
+    # If a denoms variable is factor then it should be character for the denoms calculations
+    denoms_df_prep <- denoms_df %>%
+      mutate(
+        across(dplyr::where(is.factor), ~as.character(.))
+      )
+
+    # Nested counts might have summary var come through as numeric
+    if ('summary_var' %in% map_chr(denoms_by, as_name) && is.numeric(denoms_df_prep$summary_var)) {
+      denoms_df_prep$summary_var <- as.character(denoms_df_prep$summary_var)
+    }
+
+    # But if a summary_stat variable is factor, then the denoms needs to match this
+    # This happens if sorting was triggered for the variable as a factor
+    # fct_cols will be a named logical vector of the variable names, where TRUE
+    # is the summary_stat variables that are factors
+    fct_cols <- map_lgl(summary_stat, is.factor)
+
+    if (any(fct_cols)) {
+      # From the bool vector of fct_cols, grab the names of the ones that
+      # are TRUE
+      # Create a regular expression like var1|var2|var3
+      fct_cols_ns <- paste(names(fct_cols[fct_cols]), collapse="|")
+
+      # Reset each factor variable to have the appropriate levels for the denom
+      # so that 0 filling can happen appropriately later on
+      denoms_df_prep <- denoms_df_prep %>%
+        mutate(
+          across(matches(fct_cols_ns), ~ factor(., levels=levels(summary_stat[[cur_column()]])))
+        )
+
+      rm(fct_cols_ns)
+    }
 
     # rbind tables together
     numeric_data <- summary_stat %>%
       bind_rows(total_stat) %>%
       rename("summary_var" = !!target_var[[1]]) %>%
       group_by(!!!denoms_by) %>%
-      do(get_denom_total(., denoms_by, denoms_df, "n")) %>%
+      do(get_denom_total(., denoms_by, denoms_df_prep, "n")) %>%
       mutate(summary_var = prefix_count_row(summary_var, count_row_prefix)) %>%
       ungroup()
 
@@ -175,6 +206,8 @@ process_single_count_target <- function(x) {
                                 distinct_stat[, c("distinct_n", "distinct_total")])
     }
 
+    rm(denoms_df_prep, fct_cols)
+
   }, envir = x)
 }
 
@@ -214,8 +247,8 @@ process_count_n <- function(x) {
     # If there is a missing_count_string, but its not in the dataset
     if (!is.null(missing_count_string) &&
 
-       !((any(unname(unlist(missing_count_list)) %in% unique(built_target[, as_name(target_var[[1]])]))) ||
-        any(is.na(built_target[, as_name(target_var[[1]])])))) {
+        !((any(unname(unlist(missing_count_list)) %in% unique(built_target[, as_name(target_var[[1]])]))) ||
+          any(is.na(built_target[, as_name(target_var[[1]])])))) {
       # This adds the missing string as a factor to the tallies. This is needed
       # to make sure the missing row is added even if there are no missing values.
       summary_stat <- summary_stat %>%
@@ -318,12 +351,12 @@ prepare_format_metadata.count_layer <- function(x) {
     # distinct_by
     # If both distinct and n
     if (((("distinct_n" %in% map(format_strings$n_counts$vars, as_name) &
-         "n" %in% map(format_strings$n_counts$vars, as_name)) |
-        # or both distinct_pct and pct
-        ("distinct_pct" %in% map(format_strings$n_counts$vars, as_name) &
-         "pct" %in% map(format_strings$n_counts$vars, as_name))) &
-        # AND distinct_by is null
-        is.null(distinct_by))) {
+           "n" %in% map(format_strings$n_counts$vars, as_name)) |
+          # or both distinct_pct and pct
+          ("distinct_pct" %in% map(format_strings$n_counts$vars, as_name) &
+           "pct" %in% map(format_strings$n_counts$vars, as_name))) &
+         # AND distinct_by is null
+         is.null(distinct_by))) {
       stop("You can't use distinct and non-distinct parameters without specifying a distinct_by")
     }
 
@@ -389,11 +422,11 @@ process_formatting.count_layer <- function(x, ...) {
                                has_missing_count = has_missing_count)
       }) %>%
       # Pivot table
-        pivot_wider(id_cols = c(match_exact(by), "summary_var"),
+      pivot_wider(id_cols = c(match_exact(by), "summary_var"),
                   names_from = c(!!treat_var, match_exact(cols)), values_from = n,
                   names_prefix = "var1_") %>%
-    # Replace the by variables and target variable names with `row_label<n>`
-    replace_by_string_names(quos(!!!by, summary_var))
+      # Replace the by variables and target variable names with `row_label<n>`
+      replace_by_string_names(quos(!!!by, summary_var))
 
     if (is_built_nest) {
       # I had trouble doing this in a 'tidy' way so I just did it here.
@@ -491,7 +524,7 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot
   # Iterate over every variable
   for (i in seq_along(vars_ord)) {
     str_all[[i + 1]] <-  count_string_switch_help(vars_ord[i], count_fmt, .n[!missing_rows & !total_rows], .total[!missing_rows & !total_rows],
-                                                .distinct_n[!missing_rows & !total_rows], .distinct_total[!missing_rows & !total_rows], vars_ord)
+                                                  .distinct_n[!missing_rows & !total_rows], .distinct_total[!missing_rows & !total_rows], vars_ord)
   }
 
 
@@ -501,12 +534,12 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot
   missing_str_all[1] <- missing_f_str$repl_str
   for (i in seq_along(missing_vars_ord)) {
     missing_str_all[[i + 1]] <- count_string_switch_help(missing_vars_ord[i],
-                                                       missing_f_str,
-                                                       .n[missing_rows],
-                                                       .total[missing_rows],
-                                                       .distinct_n[missing_rows],
-                                                       .distinct_total[missing_rows],
-                                                       missing_vars_ord)
+                                                         missing_f_str,
+                                                         .n[missing_rows],
+                                                         .total[missing_rows],
+                                                         .distinct_n[missing_rows],
+                                                         .distinct_total[missing_rows],
+                                                         missing_vars_ord)
   }
 
   total_str_all <- vector("list", 5)
@@ -516,9 +549,9 @@ construct_count_string <- function(.n, .total, .distinct_n = NULL, .distinct_tot
                                                        total_count_format,
                                                        .n[total_rows],
                                                        .total[total_rows],
-                                                     .distinct_n[total_rows],
-                                                     .distinct_total[total_rows],
-                                                     total_vars_ord)
+                                                       .distinct_n[total_rows],
+                                                       .distinct_total[total_rows],
+                                                       total_vars_ord)
   }
 
   # Put the vector strings together. Only include parts of str_all that aren't null
@@ -658,8 +691,8 @@ process_count_denoms <- function(x) {
                    "` is invalid. Filter error:\n", e))
     })
 
-      # For distinct counts, we want to defer back to the
-      # population dataset. Trigger this by identifying that
+    # For distinct counts, we want to defer back to the
+    # population dataset. Trigger this by identifying that
     # the population dataset was overridden
     if (!isTRUE(try(identical(pop_data, target)))) {
       # If the denom_where doesn't match the where AND the denom_where isn't true
@@ -675,30 +708,48 @@ process_count_denoms <- function(x) {
       group_by(!!!layer_params[param_apears]) %>%
       summarize(
         n = n()
-        ) %>%
+      ) %>%
       ungroup()
 
+    # If user specified treatment var as a denom by then remove it
+    # and if inside a nested layer, rename summary_var in the denoms_by
+    # for building this table
+    if (is.null(denoms_by)) denoms_by <- c(treat_var, cols)
+    dist_grp <- denoms_by
+    which_is_treatvar <- which(
+      map_lgl(denoms_by, ~ as_name(.) %in% c(as_name(pop_treat_var), as_name(treat_var)))
+    )
+    if (length(which_is_treatvar) > 0) {
+      dist_grp <- dist_grp[-which_is_treatvar]
+    }
+    is_svar <- map_lgl(dist_grp, ~as_name(.) == "summary_var")
+    if (any(is_svar)) {
+      dist_grp[[which(is_svar)]] <- layer_params[[1]]
+    }
+
     denoms_df_dist <- built_pop_data %>%
       filter(!!denom_where) %>%
-      group_by(!!pop_treat_var) %>%
+      group_by(!!pop_treat_var, !!!dist_grp) %>%
       summarize(
         distinct_n = n_distinct(!!!distinct_by, !!pop_treat_var)
       ) %>%
       ungroup()
 
-    by_join <- as_name(pop_treat_var)
-    names(by_join) <- as_name(treat_var)
+    # Create merge variables to get the denoms dataframe merged correctly
+    by_join <- map_chr(append(dist_grp, pop_treat_var, after=0), as_name)
+    names(by_join) <- map_chr(append(dist_grp, treat_var, after=0), as_name)
+
 
     denoms_df <- denoms_df_n %>%
-      complete(!!!layer_params[param_apears],
-               fill = list(n = 0)) %>%
       left_join(denoms_df_dist, by = by_join)
 
     if (as_name(target_var[[1]]) %in% names(target)) {
       denoms_df <- denoms_df %>%
         rename("summary_var" := !!target_var[[1]])
     }
 
+    rm(by_join, denoms_df_n, denoms_df_dist, dist_grp, is_svar, which_is_treatvar)
+
   }, envir = x)
 
 }
@@ -732,7 +783,7 @@ rename_missing_values <- function(x) {
           # Replace the implicit values in built_target
           built_target <- built_target %>%
             mutate(!!target_var[[1]] := fct_expand(!!target_var[[1]], "(Missing)")) %>%
-            mutate(!!target_var[[1]] := fct_explicit_na(!!target_var[[1]]))
+            mutate(!!target_var[[1]] := fct_na_value_to_level(!!target_var[[1]], level="(Missing)"))
 
         }
         built_target <- built_target %>%
@@ -757,17 +808,17 @@ filter_numeric <- function(.data,
     return(.data)
   }
 
-    vals <- .data %>%
-      {if (is.null(numeric_cutoff_column)) . else filter(., !!treat_var == numeric_cutoff_column)} %>%
-      mutate(
-        pct = n/total,
-        distinct_pct = distinct_n/distinct_total
-      ) %>%
-      filter(!!sym(numeric_cutoff_stat) >= !!numeric_cutoff) %>%
-      extract2("summary_var")
-
-    .data %>%
-      filter(summary_var %in% vals)
+  vals <- .data %>%
+    {if (is.null(numeric_cutoff_column)) . else filter(., !!treat_var == numeric_cutoff_column)} %>%
+    mutate(
+      pct = n/total,
+      distinct_pct = distinct_n/distinct_total
+    ) %>%
+    filter(!!sym(numeric_cutoff_stat) >= !!numeric_cutoff) %>%
+    extract2("summary_var")
+
+  .data %>%
+    filter(summary_var %in% vals)
 
 
 }