Tweak compactify+tolerance naming, defaults, tweak meaning

- Standardize to names & defaults of `compactify = TRUE, compactify_abs_tol = 0` in most places. Add missing tolerance setting in `epix_merge()`. Keep `should_compactify` as-is in `revision_summary()` for now. - Change `compactify = NULL` possibility to `compactify = "message"`, and message instead of warn. - Make `compactify_abs_tol = 0` still compactify when exactly equal by using `<= compactify_abs_tol` rather than `<` via `dplyr::near()`. - Update examples and vignettes to not have unnecessary `compactify = TRUE`.
cmu-delphi · Jan 24, 2025 · 054c332 · 054c332
1 parent a799625
commit 054c332
Show file tree

Hide file tree

Showing 16 changed files with 82 additions and 82 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -165,7 +165,6 @@ importFrom(dplyr,if_else)
 importFrom(dplyr,is_grouped_df)
 importFrom(dplyr,lag)
 importFrom(dplyr,mutate)
-importFrom(dplyr,near)
 importFrom(dplyr,pick)
 importFrom(dplyr,pull)
 importFrom(dplyr,relocate)

diff --git a/R/archive.R b/R/archive.R
@@ -197,9 +197,15 @@ next_after.Date <- function(x) x + 1L
 #'   that should be considered key variables (in the language of `data.table`)
 #'   apart from "geo_value", "time_value", and "version". Typical examples
 #'   are "age" or more granular geographies.
-#' @param compactify Optional; Boolean. `TRUE` will remove some
-#'   redundant rows, `FALSE` will not, and missing or `NULL` will remove
-#'   redundant rows, but issue a warning. See more information at `compactify`.
+#' @param compactify Optional; `TRUE`, `FALSE`, or `"message"`. `TRUE` will
+#'   remove some redundant rows, `FALSE` will not. `"message"` is like `TRUE`
+#'   but will emit a message if anything was changed. Default is `TRUE`. See
+#'   more information below under "Compactification:".
+#' @param compactify_abs_tol Optional; double. A tolerance level used to detect
+#'   approximate equality for compactification. The default is 0, which
+#'   corresponds to exact equality. Consider using this if your value columns
+#'   undergo tiny nonmeaningful revisions and the archive object with the
+#'   default setting is too large.
 #' @param clobberable_versions_start Optional; `length`-1; either a value of the
 #'   same `class` and `typeof` as `x$version`, or an `NA` of any `class` and
 #'   `typeof`: specifically, either (a) the earliest version that could be
@@ -224,8 +230,6 @@ next_after.Date <- function(x) x + 1L
 #'   value of `clobberable_versions_start` does not fully trust these empty
 #'   updates, and assumes that any version `>= max(x$version)` could be
 #'   clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
-#' @param compactify_tol double. the tolerance used to detect approximate
-#'   equality for compactification
 #' @return An `epi_archive` object.
 #'
 #' @seealso [`epix_as_of`] [`epix_merge`] [`epix_slide`]
@@ -391,41 +395,41 @@ validate_epi_archive <- function(x) {
 #'
 #' @keywords internal
 #' @importFrom dplyr filter
-apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) {
+apply_compactify <- function(df, keys, abs_tol = 0) {
   df %>%
     arrange(!!!keys) %>%
     filter(if_any(
       c(everything(), -version), # all non-version columns
-      ~ !is_locf(., tolerance)
+      ~ !is_locf(., abs_tol)
     ))
 }
 
 #' get the entries that `compactify` would remove
 #' @keywords internal
 #' @importFrom dplyr filter if_all everything
-removed_by_compactify <- function(df, keys, tolerance) {
+removed_by_compactify <- function(df, keys, abs_tol) {
   df %>%
     arrange(!!!keys) %>%
     filter(if_all(
       c(everything(), -version),
-      ~ is_locf(., tolerance)
+      ~ is_locf(., abs_tol)
     )) # nolint: object_usage_linter
 }
 
 #' Checks to see if a value in a vector is LOCF
 #' @description
 #' LOCF meaning last observation carried forward. lags the vector by 1, then
-#'   compares with itself. For doubles it uses float comparison via
-#'   [`dplyr::near`], otherwise it uses equality.  `NA`'s and `NaN`'s are
-#'   considered equal to themselves and each other.
-#' @importFrom dplyr lag if_else near
+#'   compares with itself. For doubles it compares whether the absolute
+#'   difference is `<= abs_tol`; otherwise it uses equality. `NA`'s and `NaN`'s
+#'   are considered equal to themselves and each other.
+#' @importFrom dplyr lag if_else
 #' @keywords internal
-is_locf <- function(vec, tolerance) { # nolint: object_usage_linter
+is_locf <- function(vec, abs_tol) { # nolint: object_usage_linter
   lag_vec <- dplyr::lag(vec)
   if (typeof(vec) == "double") {
     res <- if_else(
       !is.na(vec) & !is.na(lag_vec),
-      near(vec, lag_vec, tol = tolerance),
+      abs(vec - lag_vec) <= abs_tol,
       is.na(vec) & is.na(lag_vec)
     )
     return(res)
@@ -456,8 +460,8 @@ as_epi_archive <- function(
     geo_type = deprecated(),
     time_type = deprecated(),
     other_keys = character(),
-    compactify = NULL,
-    compactify_abs_tol = .Machine$double.eps^0.5,
+    compactify = TRUE,
+    compactify_abs_tol = 0,
     clobberable_versions_start = NA,
     .versions_end = max_version_with_row_in(x), ...,
     versions_end = .versions_end) {
@@ -483,39 +487,41 @@ as_epi_archive <- function(
   ))
 
   # Compactification:
-  assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE)
+  if (!list(compactify) %in% list(TRUE, FALSE, "message")) {
+    cli_abort('`compactify` must be `TRUE`, `FALSE`, or `"message"`')
+  }
 
   data_table <- result$DT
   key_vars <- key(data_table)
 
   nrow_before_compactify <- nrow(data_table)
   # Runs compactify on data frame
-  if (is.null(compactify) || compactify == TRUE) {
+  if (identical(compactify, TRUE) || identical(compactify, "message")) {
     compactified <- apply_compactify(data_table, key_vars, compactify_abs_tol)
   } else {
     compactified <- data_table
   }
-  # Warns about redundant rows if the number of rows decreased, and we didn't
+  # Messages about redundant rows if the number of rows decreased, and we didn't
   # explicitly say to compactify
-  if (is.null(compactify) && nrow(compactified) < nrow_before_compactify) {
+  if (identical(compactify, "message") && nrow(compactified) < nrow_before_compactify) {
     elim <- removed_by_compactify(data_table, key_vars, compactify_abs_tol)
-    warning_intro <- cli::format_inline(
+    message_intro <- cli::format_inline(
       "Found rows that appear redundant based on
        last (version of each) observation carried forward;
        these rows have been removed to 'compactify' and save space:",
       keep_whitespace = FALSE
     )
-    warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L)))
-    warning_outro <- cli::format_inline(
+    message_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L)))
+    message_outro <- cli::format_inline(
       "Built-in `epi_archive` functionality should be unaffected,
        but results may change if you work directly with its fields (such as `DT`).
        See `?as_epi_archive` for details.
-       To silence this warning but keep compactification,
+       To silence this message but keep compactification,
        you can pass `compactify=TRUE` when constructing the archive.",
       keep_whitespace = FALSE
     )
-    warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro)
-    rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows")
+    message_string <- paste(sep = "\n", message_intro, message_data, message_outro)
+    rlang::inform(message_string, class = "epiprocess__compactify_default_removed_rows")
   }
 
   result$DT <- compactified

diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
@@ -52,8 +52,7 @@
 #'   # (a.k.a. "hotfixed", "clobbered", etc.):
 #'   clobberable_versions_start = max(archive_cases_dv_subset$DT$version),
 #'   # Suppose today is the following day, and there are no updates out yet:
-#'   versions_end = max(archive_cases_dv_subset$DT$version) + 1L,
-#'   compactify = TRUE
+#'   versions_end = max(archive_cases_dv_subset$DT$version) + 1L
 #' )
 #'
 #' epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version))
@@ -263,7 +262,7 @@ epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf"
 #'   and use `min(x$versions_end, y$versions_end)` as the result's
 #'   `versions_end`.
 #'
-#' @param compactify Optional; `TRUE` (default), `FALSE`, or `NULL`; should the
+#' @param compactify Optional; `TRUE` (default), `FALSE`, or `"message"`; should the
 #'   result be compactified? See `as_epi_archive()` for details.
 #' @details
 #' When merging archives, unless the archives have identical data release
@@ -344,7 +343,7 @@ epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf"
 #' @export
 epix_merge <- function(x, y,
                        sync = c("forbid", "na", "locf", "truncate"),
-                       compactify = TRUE) {
+                       compactify = TRUE, compactify_abs_tol = 0) {
   assert_class(x, "epi_archive")
   assert_class(y, "epi_archive")
   sync <- rlang::arg_match(sync)
@@ -527,7 +526,7 @@ epix_merge <- function(x, y,
     # inputs are already compactified, but at time of writing we don't have
     # compactify in its own method or field, and it seems like it should be
     # pretty fast anyway.
-    compactify = compactify,
+    compactify = compactify, compactify_abs_tol = compactify_abs_tol,
     clobberable_versions_start = result_clobberable_versions_start,
     versions_end = new_versions_end
   ))

diff --git a/R/revision_analysis.R b/R/revision_analysis.R
@@ -57,9 +57,9 @@
 #' @param rel_spread_threshold float between 0 and 1, for the printed summary,
 #'   the relative spread fraction used to characterize revisions which don't
 #'   actually change very much. Default is .1, or 10% of the final value
-#' @param compactify_tol float, used if `drop_nas=TRUE`, it determines the
-#'   threshold for when two floats are considered identical.
 #' @param should_compactify bool. Compactify if `TRUE`.
+#' @param compactify_abs_tol float, used if `drop_nas=TRUE`, it determines the
+#'   threshold for when two floats are considered identical.
 #'
 #' @examples
 #' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli)
@@ -80,8 +80,8 @@ revision_summary <- function(epi_arch,
                              few_revisions = 3,
                              abs_spread_threshold = NULL,
                              rel_spread_threshold = 0.1,
-                             compactify_tol = .Machine$double.eps^0.5,
-                             should_compactify = TRUE) {
+                             should_compactify = TRUE,
+                             compactify_abs_tol = 0) {
   arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
   if (length(arg) == 0) {
     # Choose the first column that's not a key or version
@@ -121,7 +121,7 @@ revision_summary <- function(epi_arch,
   if (should_compactify) {
     revision_behavior <- revision_behavior %>%
       arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying
-      apply_compactify(c(keys, version), compactify_tol)
+      apply_compactify(c(keys, version), compactify_abs_tol)
   }
   revision_behavior <-
     revision_behavior %>%

diff --git a/man/apply_compactify.Rd b/man/apply_compactify.Rd
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd
diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd
diff --git a/man/is_locf.Rd b/man/is_locf.Rd
diff --git a/man/removed_by_compactify.Rd b/man/removed_by_compactify.Rd
diff --git a/man/revision_summary.Rd b/man/revision_summary.Rd
diff --git a/tests/testthat/_snaps/archive.md b/tests/testthat/_snaps/archive.md
@@ -2,12 +2,4 @@
 
     Code
       res <- dumb_ex %>% as_epi_archive()
-    Condition <epiprocess__compactify_default_removed_rows>
-      Warning:
-      Found rows that appear redundant based on last (version of each) observation carried forward; these rows have been removed to 'compactify' and save space:
-      Key: <geo_value, time_value, version>
-         geo_value time_value value    version
-            <char>     <Date> <num>     <Date>
-      1:        ca 2020-01-01     1 2020-01-02
-      Built-in `epi_archive` functionality should be unaffected, but results may change if you work directly with its fields (such as `DT`). See `?as_epi_archive` for details. To silence this warning but keep compactification, you can pass `compactify=TRUE` when constructing the archive.
 
diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R
@@ -52,14 +52,14 @@ dt <- row_replace(dt, 74, 73, 74) # Not LOCF
 
 dt_true <- as_tibble(as_epi_archive(dt, compactify = TRUE)$DT)
 dt_false <- as_tibble(as_epi_archive(dt, compactify = FALSE)$DT)
-dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt, compactify = NULL)$DT))
+dt_null <- suppressMessages(as_tibble(as_epi_archive(dt, compactify = "message")$DT))
 
-test_that("Warning for LOCF with compactify as NULL", {
-  expect_warning(as_epi_archive(dt, compactify = NULL))
+test_that('Warning for LOCF with compactify as "message"', {
+  expect_message(as_epi_archive(dt, compactify = "message"))
 })
 
 test_that("No warning when there is no LOCF", {
-  expect_warning(as_epi_archive(dt[1:5], compactify = NULL), NA)
+  expect_no_message(as_epi_archive(dt[1:5], compactify = "message"))
 })
 
 test_that("LOCF values are ignored with compactify=FALSE", {

diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd
@@ -23,10 +23,10 @@ rows to remain.
 
 There are three different values that can be assigned to `compactify`:
 
-* No argument: if there are LOCF-redundant rows, removes them and issues a
-  warning with some information about what rows were removed
-* `TRUE`: removes any LOCF-redundant rows without any warning or other feedback
-* `FALSE`: keeps any LOCF-redundant rows without any warning or other feedback
+* `TRUE` (default): removes any LOCF-redundant rows without any message or other feedback
+* `FALSE`: keeps any LOCF-redundant rows without any message or other feedback
+* `"message"`: if there are LOCF-redundant rows, removes them and produces a
+  message with some information about what rows were removed
 
 For this example, we have one chart using LOCF values, while another doesn't
 use them to illustrate LOCF. Notice how the head of the first dataset differs