diff --git a/NAMESPACE b/NAMESPACE index 804cbeaa..3d34bcdf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -165,7 +165,6 @@ importFrom(dplyr,if_else) importFrom(dplyr,is_grouped_df) importFrom(dplyr,lag) importFrom(dplyr,mutate) -importFrom(dplyr,near) importFrom(dplyr,pick) importFrom(dplyr,pull) importFrom(dplyr,relocate) diff --git a/R/archive.R b/R/archive.R index 19bc4269..ad2c5de5 100644 --- a/R/archive.R +++ b/R/archive.R @@ -197,9 +197,15 @@ next_after.Date <- function(x) x + 1L #' that should be considered key variables (in the language of `data.table`) #' apart from "geo_value", "time_value", and "version". Typical examples #' are "age" or more granular geographies. -#' @param compactify Optional; Boolean. `TRUE` will remove some -#' redundant rows, `FALSE` will not, and missing or `NULL` will remove -#' redundant rows, but issue a warning. See more information at `compactify`. +#' @param compactify Optional; `TRUE`, `FALSE`, or `"message"`. `TRUE` will +#' remove some redundant rows, `FALSE` will not. `"message"` is like `TRUE` +#' but will emit a message if anything was changed. Default is `TRUE`. See +#' more information below under "Compactification:". +#' @param compactify_abs_tol Optional; double. A tolerance level used to detect +#' approximate equality for compactification. The default is 0, which +#' corresponds to exact equality. Consider using this if your value columns +#' undergo tiny nonmeaningful revisions and the archive object with the +#' default setting is too large. #' @param clobberable_versions_start Optional; `length`-1; either a value of the #' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and #' `typeof`: specifically, either (a) the earliest version that could be @@ -224,8 +230,6 @@ next_after.Date <- function(x) x + 1L #' value of `clobberable_versions_start` does not fully trust these empty #' updates, and assumes that any version `>= max(x$version)` could be #' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. -#' @param compactify_tol double. the tolerance used to detect approximate -#' equality for compactification #' @return An `epi_archive` object. #' #' @seealso [`epix_as_of`] [`epix_merge`] [`epix_slide`] @@ -391,41 +395,41 @@ validate_epi_archive <- function(x) { #' #' @keywords internal #' @importFrom dplyr filter -apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) { +apply_compactify <- function(df, keys, abs_tol = 0) { df %>% arrange(!!!keys) %>% filter(if_any( c(everything(), -version), # all non-version columns - ~ !is_locf(., tolerance) + ~ !is_locf(., abs_tol) )) } #' get the entries that `compactify` would remove #' @keywords internal #' @importFrom dplyr filter if_all everything -removed_by_compactify <- function(df, keys, tolerance) { +removed_by_compactify <- function(df, keys, abs_tol) { df %>% arrange(!!!keys) %>% filter(if_all( c(everything(), -version), - ~ is_locf(., tolerance) + ~ is_locf(., abs_tol) )) # nolint: object_usage_linter } #' Checks to see if a value in a vector is LOCF #' @description #' LOCF meaning last observation carried forward. lags the vector by 1, then -#' compares with itself. For doubles it uses float comparison via -#' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are -#' considered equal to themselves and each other. -#' @importFrom dplyr lag if_else near +#' compares with itself. For doubles it compares whether the absolute +#' difference is `<= abs_tol`; otherwise it uses equality. `NA`'s and `NaN`'s +#' are considered equal to themselves and each other. +#' @importFrom dplyr lag if_else #' @keywords internal -is_locf <- function(vec, tolerance) { # nolint: object_usage_linter +is_locf <- function(vec, abs_tol) { # nolint: object_usage_linter lag_vec <- dplyr::lag(vec) if (typeof(vec) == "double") { res <- if_else( !is.na(vec) & !is.na(lag_vec), - near(vec, lag_vec, tol = tolerance), + abs(vec - lag_vec) <= abs_tol, is.na(vec) & is.na(lag_vec) ) return(res) @@ -456,8 +460,8 @@ as_epi_archive <- function( geo_type = deprecated(), time_type = deprecated(), other_keys = character(), - compactify = NULL, - compactify_abs_tol = .Machine$double.eps^0.5, + compactify = TRUE, + compactify_abs_tol = 0, clobberable_versions_start = NA, .versions_end = max_version_with_row_in(x), ..., versions_end = .versions_end) { @@ -483,39 +487,41 @@ as_epi_archive <- function( )) # Compactification: - assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) + if (!list(compactify) %in% list(TRUE, FALSE, "message")) { + cli_abort('`compactify` must be `TRUE`, `FALSE`, or `"message"`') + } data_table <- result$DT key_vars <- key(data_table) nrow_before_compactify <- nrow(data_table) # Runs compactify on data frame - if (is.null(compactify) || compactify == TRUE) { + if (identical(compactify, TRUE) || identical(compactify, "message")) { compactified <- apply_compactify(data_table, key_vars, compactify_abs_tol) } else { compactified <- data_table } - # Warns about redundant rows if the number of rows decreased, and we didn't + # Messages about redundant rows if the number of rows decreased, and we didn't # explicitly say to compactify - if (is.null(compactify) && nrow(compactified) < nrow_before_compactify) { + if (identical(compactify, "message") && nrow(compactified) < nrow_before_compactify) { elim <- removed_by_compactify(data_table, key_vars, compactify_abs_tol) - warning_intro <- cli::format_inline( + message_intro <- cli::format_inline( "Found rows that appear redundant based on last (version of each) observation carried forward; these rows have been removed to 'compactify' and save space:", keep_whitespace = FALSE ) - warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) - warning_outro <- cli::format_inline( + message_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) + message_outro <- cli::format_inline( "Built-in `epi_archive` functionality should be unaffected, but results may change if you work directly with its fields (such as `DT`). See `?as_epi_archive` for details. - To silence this warning but keep compactification, + To silence this message but keep compactification, you can pass `compactify=TRUE` when constructing the archive.", keep_whitespace = FALSE ) - warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) - rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") + message_string <- paste(sep = "\n", message_intro, message_data, message_outro) + rlang::inform(message_string, class = "epiprocess__compactify_default_removed_rows") } result$DT <- compactified diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 9ad45673..2432cc82 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -52,8 +52,7 @@ #' # (a.k.a. "hotfixed", "clobbered", etc.): #' clobberable_versions_start = max(archive_cases_dv_subset$DT$version), #' # Suppose today is the following day, and there are no updates out yet: -#' versions_end = max(archive_cases_dv_subset$DT$version) + 1L, -#' compactify = TRUE +#' versions_end = max(archive_cases_dv_subset$DT$version) + 1L #' ) #' #' epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version)) @@ -263,7 +262,7 @@ epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf" #' and use `min(x$versions_end, y$versions_end)` as the result's #' `versions_end`. #' -#' @param compactify Optional; `TRUE` (default), `FALSE`, or `NULL`; should the +#' @param compactify Optional; `TRUE` (default), `FALSE`, or `"message"`; should the #' result be compactified? See `as_epi_archive()` for details. #' @details #' When merging archives, unless the archives have identical data release @@ -344,7 +343,7 @@ epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf" #' @export epix_merge <- function(x, y, sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE) { + compactify = TRUE, compactify_abs_tol = 0) { assert_class(x, "epi_archive") assert_class(y, "epi_archive") sync <- rlang::arg_match(sync) @@ -527,7 +526,7 @@ epix_merge <- function(x, y, # inputs are already compactified, but at time of writing we don't have # compactify in its own method or field, and it seems like it should be # pretty fast anyway. - compactify = compactify, + compactify = compactify, compactify_abs_tol = compactify_abs_tol, clobberable_versions_start = result_clobberable_versions_start, versions_end = new_versions_end )) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 27944489..1bceb0b8 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -57,9 +57,9 @@ #' @param rel_spread_threshold float between 0 and 1, for the printed summary, #' the relative spread fraction used to characterize revisions which don't #' actually change very much. Default is .1, or 10% of the final value -#' @param compactify_tol float, used if `drop_nas=TRUE`, it determines the -#' threshold for when two floats are considered identical. #' @param should_compactify bool. Compactify if `TRUE`. +#' @param compactify_abs_tol float, used if `drop_nas=TRUE`, it determines the +#' threshold for when two floats are considered identical. #' #' @examples #' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) @@ -80,8 +80,8 @@ revision_summary <- function(epi_arch, few_revisions = 3, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, - compactify_tol = .Machine$double.eps^0.5, - should_compactify = TRUE) { + should_compactify = TRUE, + compactify_abs_tol = 0) { arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) if (length(arg) == 0) { # Choose the first column that's not a key or version @@ -121,7 +121,7 @@ revision_summary <- function(epi_arch, if (should_compactify) { revision_behavior <- revision_behavior %>% arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying - apply_compactify(c(keys, version), compactify_tol) + apply_compactify(c(keys, version), compactify_abs_tol) } revision_behavior <- revision_behavior %>% diff --git a/man/apply_compactify.Rd b/man/apply_compactify.Rd index 0e1f0b3c..7426dd7a 100644 --- a/man/apply_compactify.Rd +++ b/man/apply_compactify.Rd @@ -4,7 +4,7 @@ \alias{apply_compactify} \title{Given a tibble as would be found in an epi_archive, remove duplicate entries.} \usage{ -apply_compactify(df, keys, tolerance = .Machine$double.eps^0.5) +apply_compactify(df, keys, abs_tol = 0) } \description{ Works by shifting all rows except the version, then comparing values to see diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 28d87c3d..f98666e0 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -23,8 +23,8 @@ as_epi_archive( geo_type = deprecated(), time_type = deprecated(), other_keys = character(), - compactify = NULL, - compactify_abs_tol = .Machine$double.eps^0.5, + compactify = TRUE, + compactify_abs_tol = 0, clobberable_versions_start = NA, .versions_end = max_version_with_row_in(x), ..., @@ -73,9 +73,16 @@ value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} -\item{compactify}{Optional; Boolean. \code{TRUE} will remove some -redundant rows, \code{FALSE} will not, and missing or \code{NULL} will remove -redundant rows, but issue a warning. See more information at \code{compactify}.} +\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{"message"}. \code{TRUE} will +remove some redundant rows, \code{FALSE} will not. \code{"message"} is like \code{TRUE} +but will emit a message if anything was changed. Default is \code{TRUE}. See +more information below under "Compactification:".} + +\item{compactify_abs_tol}{Optional; double. A tolerance level used to detect +approximate equality for compactification. The default is 0, which +corresponds to exact equality. Consider using this if your value columns +undergo tiny nonmeaningful revisions and the archive object with the +default setting is too large.} \item{.versions_end}{location based versions_end, used to avoid prefix \code{version = issue} from being assigned to \code{versions_end} instead of being @@ -83,9 +90,6 @@ used to rename columns.} \item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For example \code{version = release_date}} - -\item{compactify_tol}{double. the tolerance used to detect approximate -equality for compactification} } \value{ An \code{epi_archive} object. diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index c3682489..be5c977a 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -71,8 +71,7 @@ archive_cases_dv_subset2 <- as_epi_archive( # (a.k.a. "hotfixed", "clobbered", etc.): clobberable_versions_start = max(archive_cases_dv_subset$DT$version), # Suppose today is the following day, and there are no updates out yet: - versions_end = max(archive_cases_dv_subset$DT$version) + 1L, - compactify = TRUE + versions_end = max(archive_cases_dv_subset$DT$version) + 1L ) epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version)) diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 3ffebc99..af20e228 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -8,7 +8,8 @@ epix_merge( x, y, sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE + compactify = TRUE, + compactify_abs_tol = 0 ) } \arguments{ @@ -33,7 +34,7 @@ and use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end}. }} -\item{compactify}{Optional; \code{TRUE} (default), \code{FALSE}, or \code{NULL}; should the +\item{compactify}{Optional; \code{TRUE} (default), \code{FALSE}, or \code{"message"}; should the result be compactified? See \code{as_epi_archive()} for details.} } \value{ diff --git a/man/is_locf.Rd b/man/is_locf.Rd index 8efeecfd..8755d204 100644 --- a/man/is_locf.Rd +++ b/man/is_locf.Rd @@ -4,12 +4,12 @@ \alias{is_locf} \title{Checks to see if a value in a vector is LOCF} \usage{ -is_locf(vec, tolerance) +is_locf(vec, abs_tol) } \description{ LOCF meaning last observation carried forward. lags the vector by 1, then -compares with itself. For doubles it uses float comparison via -\code{\link[dplyr:near]{dplyr::near}}, otherwise it uses equality. \code{NA}'s and \code{NaN}'s are -considered equal to themselves and each other. +compares with itself. For doubles it compares whether the absolute +difference is \verb{<= abs_tol}; otherwise it uses equality. \code{NA}'s and \code{NaN}'s +are considered equal to themselves and each other. } \keyword{internal} diff --git a/man/removed_by_compactify.Rd b/man/removed_by_compactify.Rd index 2f129888..9ea4e539 100644 --- a/man/removed_by_compactify.Rd +++ b/man/removed_by_compactify.Rd @@ -4,7 +4,7 @@ \alias{removed_by_compactify} \title{get the entries that \code{compactify} would remove} \usage{ -removed_by_compactify(df, keys, tolerance) +removed_by_compactify(df, keys, abs_tol) } \description{ get the entries that \code{compactify} would remove diff --git a/man/revision_summary.Rd b/man/revision_summary.Rd index 39a72b9a..2d7247be 100644 --- a/man/revision_summary.Rd +++ b/man/revision_summary.Rd @@ -15,8 +15,8 @@ revision_summary( few_revisions = 3, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, - compactify_tol = .Machine$double.eps^0.5, - should_compactify = TRUE + should_compactify = TRUE, + compactify_abs_tol = 0 ) } \arguments{ @@ -62,10 +62,10 @@ for the scale of the dataset.} the relative spread fraction used to characterize revisions which don't actually change very much. Default is .1, or 10\% of the final value} -\item{compactify_tol}{float, used if \code{drop_nas=TRUE}, it determines the -threshold for when two floats are considered identical.} - \item{should_compactify}{bool. Compactify if \code{TRUE}.} + +\item{compactify_abs_tol}{float, used if \code{drop_nas=TRUE}, it determines the +threshold for when two floats are considered identical.} } \description{ \code{revision_summary} removes all missing values (if requested), and then diff --git a/tests/testthat/_snaps/archive.md b/tests/testthat/_snaps/archive.md index 6e010da0..e81b2d6a 100644 --- a/tests/testthat/_snaps/archive.md +++ b/tests/testthat/_snaps/archive.md @@ -2,12 +2,4 @@ Code res <- dumb_ex %>% as_epi_archive() - Condition - Warning: - Found rows that appear redundant based on last (version of each) observation carried forward; these rows have been removed to 'compactify' and save space: - Key: - geo_value time_value value version - - 1: ca 2020-01-01 1 2020-01-02 - Built-in `epi_archive` functionality should be unaffected, but results may change if you work directly with its fields (such as `DT`). See `?as_epi_archive` for details. To silence this warning but keep compactification, you can pass `compactify=TRUE` when constructing the archive. diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index d05fe0b3..26b97c4a 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -52,14 +52,14 @@ dt <- row_replace(dt, 74, 73, 74) # Not LOCF dt_true <- as_tibble(as_epi_archive(dt, compactify = TRUE)$DT) dt_false <- as_tibble(as_epi_archive(dt, compactify = FALSE)$DT) -dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt, compactify = NULL)$DT)) +dt_null <- suppressMessages(as_tibble(as_epi_archive(dt, compactify = "message")$DT)) -test_that("Warning for LOCF with compactify as NULL", { - expect_warning(as_epi_archive(dt, compactify = NULL)) +test_that('Warning for LOCF with compactify as "message"', { + expect_message(as_epi_archive(dt, compactify = "message")) }) test_that("No warning when there is no LOCF", { - expect_warning(as_epi_archive(dt[1:5], compactify = NULL), NA) + expect_no_message(as_epi_archive(dt[1:5], compactify = "message")) }) test_that("LOCF values are ignored with compactify=FALSE", { diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 0101100a..a1e7a108 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -23,10 +23,10 @@ rows to remain. There are three different values that can be assigned to `compactify`: -* No argument: if there are LOCF-redundant rows, removes them and issues a - warning with some information about what rows were removed -* `TRUE`: removes any LOCF-redundant rows without any warning or other feedback -* `FALSE`: keeps any LOCF-redundant rows without any warning or other feedback +* `TRUE` (default): removes any LOCF-redundant rows without any message or other feedback +* `FALSE`: keeps any LOCF-redundant rows without any message or other feedback +* `"message"`: if there are LOCF-redundant rows, removes them and produces a + message with some information about what rows were removed For this example, we have one chart using LOCF values, while another doesn't use them to illustrate LOCF. Notice how the head of the first dataset differs diff --git a/vignettes/epi_archive.Rmd b/vignettes/epi_archive.Rmd index 34f5a74b..904cb7c2 100644 --- a/vignettes/epi_archive.Rmd +++ b/vignettes/epi_archive.Rmd @@ -87,7 +87,7 @@ format, with `issue` playing the role of `version`. We can now use ```{r} dv_archive <- dv %>% select(geo_value, time_value, version, percent_cli) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive() dv_archive ``` @@ -216,9 +216,9 @@ y <- pub_covidcast( issues = epirange(20200601, 20211201) ) %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive() -dv_cases_archive <- epix_merge(dv_archive, y, sync = "locf", compactify = TRUE) +dv_cases_archive <- epix_merge(dv_archive, y, sync = "locf") print(dv_cases_archive) ``` diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 7435f704..d979d685 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -168,7 +168,7 @@ dv <- pub_covidcast( issues = epirange(20200601, 20211201) ) %>% select(geo_value, time_value, issue, percent_cli = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive() dv ``` @@ -182,7 +182,7 @@ library(ggplot2) dv <- archive_cases_dv_subset$DT %>% select(-case_rate_7d_av) %>% tidyr::drop_na() %>% - as_epi_archive(compactify = TRUE) + as_epi_archive() dv ```