Skip to content

Commit

Permalink
Tweak compactify+tolerance naming, defaults, tweak meaning
Browse files Browse the repository at this point in the history
- Standardize to names & defaults of `compactify = TRUE, compactify_abs_tol = 0`
  in most places. Add missing tolerance setting in `epix_merge()`. Keep
  `should_compactify` as-is in `revision_summary()` for now.
- Change `compactify = NULL` possibility to `compactify = "message"`, and
  message instead of warn.
- Make `compactify_abs_tol = 0` still compactify when exactly equal by using `<=
  compactify_abs_tol` rather than `<` via `dplyr::near()`.
- Update examples and vignettes to not have unnecessary `compactify = TRUE`.
  • Loading branch information
brookslogan committed Jan 24, 2025
1 parent a799625 commit 054c332
Show file tree
Hide file tree
Showing 16 changed files with 82 additions and 82 deletions.
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ importFrom(dplyr,if_else)
importFrom(dplyr,is_grouped_df)
importFrom(dplyr,lag)
importFrom(dplyr,mutate)
importFrom(dplyr,near)
importFrom(dplyr,pick)
importFrom(dplyr,pull)
importFrom(dplyr,relocate)
Expand Down
60 changes: 33 additions & 27 deletions R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,15 @@ next_after.Date <- function(x) x + 1L
#' that should be considered key variables (in the language of `data.table`)
#' apart from "geo_value", "time_value", and "version". Typical examples
#' are "age" or more granular geographies.
#' @param compactify Optional; Boolean. `TRUE` will remove some
#' redundant rows, `FALSE` will not, and missing or `NULL` will remove
#' redundant rows, but issue a warning. See more information at `compactify`.
#' @param compactify Optional; `TRUE`, `FALSE`, or `"message"`. `TRUE` will
#' remove some redundant rows, `FALSE` will not. `"message"` is like `TRUE`
#' but will emit a message if anything was changed. Default is `TRUE`. See
#' more information below under "Compactification:".
#' @param compactify_abs_tol Optional; double. A tolerance level used to detect
#' approximate equality for compactification. The default is 0, which
#' corresponds to exact equality. Consider using this if your value columns
#' undergo tiny nonmeaningful revisions and the archive object with the
#' default setting is too large.
#' @param clobberable_versions_start Optional; `length`-1; either a value of the
#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and
#' `typeof`: specifically, either (a) the earliest version that could be
Expand All @@ -224,8 +230,6 @@ next_after.Date <- function(x) x + 1L
#' value of `clobberable_versions_start` does not fully trust these empty
#' updates, and assumes that any version `>= max(x$version)` could be
#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
#' @param compactify_tol double. the tolerance used to detect approximate
#' equality for compactification
#' @return An `epi_archive` object.
#'
#' @seealso [`epix_as_of`] [`epix_merge`] [`epix_slide`]
Expand Down Expand Up @@ -391,41 +395,41 @@ validate_epi_archive <- function(x) {
#'
#' @keywords internal
#' @importFrom dplyr filter
apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) {
apply_compactify <- function(df, keys, abs_tol = 0) {
df %>%
arrange(!!!keys) %>%
filter(if_any(
c(everything(), -version), # all non-version columns
~ !is_locf(., tolerance)
~ !is_locf(., abs_tol)
))
}

#' get the entries that `compactify` would remove
#' @keywords internal
#' @importFrom dplyr filter if_all everything
removed_by_compactify <- function(df, keys, tolerance) {
removed_by_compactify <- function(df, keys, abs_tol) {
df %>%
arrange(!!!keys) %>%
filter(if_all(
c(everything(), -version),
~ is_locf(., tolerance)
~ is_locf(., abs_tol)
)) # nolint: object_usage_linter
}

#' Checks to see if a value in a vector is LOCF
#' @description
#' LOCF meaning last observation carried forward. lags the vector by 1, then
#' compares with itself. For doubles it uses float comparison via
#' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are
#' considered equal to themselves and each other.
#' @importFrom dplyr lag if_else near
#' compares with itself. For doubles it compares whether the absolute
#' difference is `<= abs_tol`; otherwise it uses equality. `NA`'s and `NaN`'s
#' are considered equal to themselves and each other.
#' @importFrom dplyr lag if_else
#' @keywords internal
is_locf <- function(vec, tolerance) { # nolint: object_usage_linter
is_locf <- function(vec, abs_tol) { # nolint: object_usage_linter
lag_vec <- dplyr::lag(vec)
if (typeof(vec) == "double") {
res <- if_else(
!is.na(vec) & !is.na(lag_vec),
near(vec, lag_vec, tol = tolerance),
abs(vec - lag_vec) <= abs_tol,
is.na(vec) & is.na(lag_vec)
)
return(res)
Expand Down Expand Up @@ -456,8 +460,8 @@ as_epi_archive <- function(
geo_type = deprecated(),
time_type = deprecated(),
other_keys = character(),
compactify = NULL,
compactify_abs_tol = .Machine$double.eps^0.5,
compactify = TRUE,
compactify_abs_tol = 0,
clobberable_versions_start = NA,
.versions_end = max_version_with_row_in(x), ...,
versions_end = .versions_end) {
Expand All @@ -483,39 +487,41 @@ as_epi_archive <- function(
))

# Compactification:
assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE)
if (!list(compactify) %in% list(TRUE, FALSE, "message")) {
cli_abort('`compactify` must be `TRUE`, `FALSE`, or `"message"`')
}

data_table <- result$DT
key_vars <- key(data_table)

nrow_before_compactify <- nrow(data_table)
# Runs compactify on data frame
if (is.null(compactify) || compactify == TRUE) {
if (identical(compactify, TRUE) || identical(compactify, "message")) {
compactified <- apply_compactify(data_table, key_vars, compactify_abs_tol)
} else {
compactified <- data_table
}
# Warns about redundant rows if the number of rows decreased, and we didn't
# Messages about redundant rows if the number of rows decreased, and we didn't
# explicitly say to compactify
if (is.null(compactify) && nrow(compactified) < nrow_before_compactify) {
if (identical(compactify, "message") && nrow(compactified) < nrow_before_compactify) {
elim <- removed_by_compactify(data_table, key_vars, compactify_abs_tol)
warning_intro <- cli::format_inline(
message_intro <- cli::format_inline(
"Found rows that appear redundant based on
last (version of each) observation carried forward;
these rows have been removed to 'compactify' and save space:",
keep_whitespace = FALSE
)
warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L)))
warning_outro <- cli::format_inline(
message_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L)))
message_outro <- cli::format_inline(
"Built-in `epi_archive` functionality should be unaffected,
but results may change if you work directly with its fields (such as `DT`).
See `?as_epi_archive` for details.
To silence this warning but keep compactification,
To silence this message but keep compactification,
you can pass `compactify=TRUE` when constructing the archive.",
keep_whitespace = FALSE
)
warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro)
rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows")
message_string <- paste(sep = "\n", message_intro, message_data, message_outro)
rlang::inform(message_string, class = "epiprocess__compactify_default_removed_rows")
}

result$DT <- compactified
Expand Down
9 changes: 4 additions & 5 deletions R/methods-epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@
#' # (a.k.a. "hotfixed", "clobbered", etc.):
#' clobberable_versions_start = max(archive_cases_dv_subset$DT$version),
#' # Suppose today is the following day, and there are no updates out yet:
#' versions_end = max(archive_cases_dv_subset$DT$version) + 1L,
#' compactify = TRUE
#' versions_end = max(archive_cases_dv_subset$DT$version) + 1L
#' )
#'
#' epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version))
Expand Down Expand Up @@ -263,7 +262,7 @@ epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf"
#' and use `min(x$versions_end, y$versions_end)` as the result's
#' `versions_end`.
#'
#' @param compactify Optional; `TRUE` (default), `FALSE`, or `NULL`; should the
#' @param compactify Optional; `TRUE` (default), `FALSE`, or `"message"`; should the
#' result be compactified? See `as_epi_archive()` for details.
#' @details
#' When merging archives, unless the archives have identical data release
Expand Down Expand Up @@ -344,7 +343,7 @@ epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf"
#' @export
epix_merge <- function(x, y,
sync = c("forbid", "na", "locf", "truncate"),
compactify = TRUE) {
compactify = TRUE, compactify_abs_tol = 0) {
assert_class(x, "epi_archive")
assert_class(y, "epi_archive")
sync <- rlang::arg_match(sync)
Expand Down Expand Up @@ -527,7 +526,7 @@ epix_merge <- function(x, y,
# inputs are already compactified, but at time of writing we don't have
# compactify in its own method or field, and it seems like it should be
# pretty fast anyway.
compactify = compactify,
compactify = compactify, compactify_abs_tol = compactify_abs_tol,
clobberable_versions_start = result_clobberable_versions_start,
versions_end = new_versions_end
))
Expand Down
10 changes: 5 additions & 5 deletions R/revision_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@
#' @param rel_spread_threshold float between 0 and 1, for the printed summary,
#' the relative spread fraction used to characterize revisions which don't
#' actually change very much. Default is .1, or 10% of the final value
#' @param compactify_tol float, used if `drop_nas=TRUE`, it determines the
#' threshold for when two floats are considered identical.
#' @param should_compactify bool. Compactify if `TRUE`.
#' @param compactify_abs_tol float, used if `drop_nas=TRUE`, it determines the
#' threshold for when two floats are considered identical.
#'
#' @examples
#' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli)
Expand All @@ -80,8 +80,8 @@ revision_summary <- function(epi_arch,
few_revisions = 3,
abs_spread_threshold = NULL,
rel_spread_threshold = 0.1,
compactify_tol = .Machine$double.eps^0.5,
should_compactify = TRUE) {
should_compactify = TRUE,
compactify_abs_tol = 0) {
arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
if (length(arg) == 0) {
# Choose the first column that's not a key or version
Expand Down Expand Up @@ -121,7 +121,7 @@ revision_summary <- function(epi_arch,
if (should_compactify) {
revision_behavior <- revision_behavior %>%
arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying
apply_compactify(c(keys, version), compactify_tol)
apply_compactify(c(keys, version), compactify_abs_tol)
}
revision_behavior <-
revision_behavior %>%
Expand Down
2 changes: 1 addition & 1 deletion man/apply_compactify.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 12 additions & 8 deletions man/epi_archive.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/epix_as_of.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions man/epix_merge.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions man/is_locf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/removed_by_compactify.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions man/revision_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 0 additions & 8 deletions tests/testthat/_snaps/archive.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,4 @@

Code
res <- dumb_ex %>% as_epi_archive()
Condition <epiprocess__compactify_default_removed_rows>
Warning:
Found rows that appear redundant based on last (version of each) observation carried forward; these rows have been removed to 'compactify' and save space:
Key: <geo_value, time_value, version>
geo_value time_value value version
<char> <Date> <num> <Date>
1: ca 2020-01-01 1 2020-01-02
Built-in `epi_archive` functionality should be unaffected, but results may change if you work directly with its fields (such as `DT`). See `?as_epi_archive` for details. To silence this warning but keep compactification, you can pass `compactify=TRUE` when constructing the archive.

8 changes: 4 additions & 4 deletions tests/testthat/test-compactify.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,14 @@ dt <- row_replace(dt, 74, 73, 74) # Not LOCF

dt_true <- as_tibble(as_epi_archive(dt, compactify = TRUE)$DT)
dt_false <- as_tibble(as_epi_archive(dt, compactify = FALSE)$DT)
dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt, compactify = NULL)$DT))
dt_null <- suppressMessages(as_tibble(as_epi_archive(dt, compactify = "message")$DT))

test_that("Warning for LOCF with compactify as NULL", {
expect_warning(as_epi_archive(dt, compactify = NULL))
test_that('Warning for LOCF with compactify as "message"', {
expect_message(as_epi_archive(dt, compactify = "message"))
})

test_that("No warning when there is no LOCF", {
expect_warning(as_epi_archive(dt[1:5], compactify = NULL), NA)
expect_no_message(as_epi_archive(dt[1:5], compactify = "message"))
})

test_that("LOCF values are ignored with compactify=FALSE", {
Expand Down
8 changes: 4 additions & 4 deletions vignettes/compactify.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ rows to remain.

There are three different values that can be assigned to `compactify`:

* No argument: if there are LOCF-redundant rows, removes them and issues a
warning with some information about what rows were removed
* `TRUE`: removes any LOCF-redundant rows without any warning or other feedback
* `FALSE`: keeps any LOCF-redundant rows without any warning or other feedback
* `TRUE` (default): removes any LOCF-redundant rows without any message or other feedback
* `FALSE`: keeps any LOCF-redundant rows without any message or other feedback
* `"message"`: if there are LOCF-redundant rows, removes them and produces a
message with some information about what rows were removed

For this example, we have one chart using LOCF values, while another doesn't
use them to illustrate LOCF. Notice how the head of the first dataset differs
Expand Down
Loading

0 comments on commit 054c332

Please sign in to comment.