Skip to content

Commit

Permalink
compactify_abs_tol: use on is.numeric, don't use on edf keys
Browse files Browse the repository at this point in the history
  • Loading branch information
brookslogan committed Jan 25, 2025
1 parent c0d6071 commit 4c73e98
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 21 deletions.
62 changes: 46 additions & 16 deletions R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -397,11 +397,17 @@ validate_epi_archive <- function(x) {
#' @importFrom dplyr filter
apply_compactify <- function(df, keys, abs_tol = 0) {
df %>%
arrange(!!!keys) %>%
filter(if_any(
c(everything(), -version), # all non-version columns
~ !is_locf(., abs_tol)
))
arrange(!!!keys) %>% # in case not archive DT & key
filter(
if_any(
all_of(keys) & !"version", # epikeytimes
~ !is_locf(.x, abs_tol, TRUE)
) |
if_any(
!all_of(keys), # measurement columns
~ !is_locf(.x, abs_tol, FALSE)
)
)
}

#' get the entries that `compactify` would remove
Expand All @@ -410,23 +416,47 @@ apply_compactify <- function(df, keys, abs_tol = 0) {
removed_by_compactify <- function(df, keys, abs_tol) {
df %>%
arrange(!!!keys) %>%
filter(if_all(
c(everything(), -version),
~ is_locf(., abs_tol)
)) # nolint: object_usage_linter
filter(
if_all(
all_of(keys) & !"version", # epikeytimes
~ is_locf(.x, abs_tol, TRUE)
) &
if_all(
!all_of(keys), # measurement columns
~ is_locf(.x, abs_tol, FALSE)
)
)
}

#' Checks to see if a value in a vector is LOCF
#' @description
#' LOCF meaning last observation carried forward. lags the vector by 1, then
#' compares with itself. For doubles it compares whether the absolute
#' difference is `<= abs_tol`; otherwise it uses equality. `NA`'s and `NaN`'s
#' are considered equal to themselves and each other.
#' @description LOCF meaning last observation carried forward (to later
#' versions). Lags the vector by 1, then compares with itself. If `is_key` is
#' `TRUE`, only values that are exactly the same between the lagged and
#' original are considered LOCF. If `is_key` is `FALSE` and `vec` is a vector
#' of numbers ([`base::is.numeric`]), then approximate equality will be used,
#' checking whether the absolute difference between each pair of entries is
#' `<= abs_tol`; if `vec` is something else, then exact equality is used
#' instead.
#'
#' @details
#'
#' We include epikey-time columns in LOCF comparisons as part of an optimization
#' to avoid slower grouped operations while still ensuring that the first
#' observation for each time series will not be marked as LOCF. We test these
#' key columns for exact equality to prevent chopping off consecutive
#' time_values during flat periods when `abs_tol` is high.
#'
#' We use exact equality for non-`is.numeric` double/integer columns such as
#' dates, datetimes, difftimes, `tsibble::yearmonth`s, etc., as these may be
#' used as part of re-indexing or grouping procedures, and we don't want to
#' change the number of groups for those operations when we remove LOCF data
#' during compactification.
#'
#' @importFrom dplyr lag if_else
#' @keywords internal
is_locf <- function(vec, abs_tol) { # nolint: object_usage_linter
is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter
lag_vec <- dplyr::lag(vec)
if (typeof(vec) == "double") {
if (is.numeric(vec) && !is_key) {
res <- if_else(
!is.na(vec) & !is.na(lag_vec),
abs(vec - lag_vec) <= abs_tol,
Expand Down
27 changes: 22 additions & 5 deletions man/is_locf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 49 additions & 0 deletions tests/testthat/test-compactify.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ dt <- row_replace(dt, 74, 73, 74) # Not LOCF
dt_true <- as_tibble(as_epi_archive(dt, compactify = TRUE)$DT)
dt_false <- as_tibble(as_epi_archive(dt, compactify = FALSE)$DT)
dt_message <- suppressMessages(as_tibble(as_epi_archive(dt, compactify = "message")$DT))
dt_0 <- as_tibble(as_epi_archive(dt, compactify = TRUE, compactify_abs_tol = 0)$DT)

test_that('Warning for LOCF with compactify as "message"', {
expect_message(as_epi_archive(dt, compactify = "message"))
Expand All @@ -71,6 +72,9 @@ test_that("LOCF values are taken out with compactify=TRUE", {

expect_identical(dt_true, dt_message)
expect_identical(dt_message, dt_test)

# Tolerance is nonstrict and tolerance 0 still compactifies:
expect_identical(dt_0, dt_test)
})

test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", {
Expand Down Expand Up @@ -105,3 +109,48 @@ test_that("compactify does not alter the default clobberable and observed versio
expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start)
expect_identical(ea_true$versions_end, ea_false$versions_end)
})

test_that("Large compactify_abs_tol does not drop edf keys", {
# several epikeytimes, each with a single version
x <- tibble(
geo_value = 1,
time_value = 1:5,
version = 11:15,
value = 1001:1005
)
# We shouldn't drop epikeytimes:
expect_equal(as_tibble(as.data.frame(as_epi_archive(x, compactify_abs_tol = 3)$DT)), x)
})

test_that("Large compactify_abs_tol does not apply to non-is.numeric columns", {
# one epikeytime with many versions:
d <- as.Date("2000-01-01")
x <- tibble(
geo_value = 1,
time_value = d + 1,
version = d + 11:15,
lag = version - time_value, # non-is.numeric
value = 1001:1005
)
expect_equal(as_tibble(as.data.frame(as_epi_archive(x, compactify_abs_tol = 3)$DT)), x)
})

test_that("Large compactify_abs_tol works on value columns", {
# one epikeytime with many versions:
d <- as.Date("2000-01-01")
x <- tibble(
geo_value = 1,
time_value = d + 1,
version = d + 11:15,
value = 1001:1005
)
expect_equal(
as_tibble(as.data.frame(as_epi_archive(x, compactify_abs_tol = 3)$DT)),
tibble(
geo_value = 1,
time_value = d + 1,
version = d + 11, # XXX do we want d + c(11,14) instead?
value = 1001 # XXX do we want c(1001, 1004) instead?
)
)
})

0 comments on commit 4c73e98

Please sign in to comment.