Skip to content

Commit 4c73e98

Browse files
committed
compactify_abs_tol: use on is.numeric, don't use on edf keys
1 parent c0d6071 commit 4c73e98

File tree

3 files changed

+117
-21
lines changed

3 files changed

+117
-21
lines changed

R/archive.R

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -397,11 +397,17 @@ validate_epi_archive <- function(x) {
397397
#' @importFrom dplyr filter
398398
apply_compactify <- function(df, keys, abs_tol = 0) {
399399
df %>%
400-
arrange(!!!keys) %>%
401-
filter(if_any(
402-
c(everything(), -version), # all non-version columns
403-
~ !is_locf(., abs_tol)
404-
))
400+
arrange(!!!keys) %>% # in case not archive DT & key
401+
filter(
402+
if_any(
403+
all_of(keys) & !"version", # epikeytimes
404+
~ !is_locf(.x, abs_tol, TRUE)
405+
) |
406+
if_any(
407+
!all_of(keys), # measurement columns
408+
~ !is_locf(.x, abs_tol, FALSE)
409+
)
410+
)
405411
}
406412

407413
#' get the entries that `compactify` would remove
@@ -410,23 +416,47 @@ apply_compactify <- function(df, keys, abs_tol = 0) {
410416
removed_by_compactify <- function(df, keys, abs_tol) {
411417
df %>%
412418
arrange(!!!keys) %>%
413-
filter(if_all(
414-
c(everything(), -version),
415-
~ is_locf(., abs_tol)
416-
)) # nolint: object_usage_linter
419+
filter(
420+
if_all(
421+
all_of(keys) & !"version", # epikeytimes
422+
~ is_locf(.x, abs_tol, TRUE)
423+
) &
424+
if_all(
425+
!all_of(keys), # measurement columns
426+
~ is_locf(.x, abs_tol, FALSE)
427+
)
428+
)
417429
}
418430

419431
#' Checks to see if a value in a vector is LOCF
420-
#' @description
421-
#' LOCF meaning last observation carried forward. lags the vector by 1, then
422-
#' compares with itself. For doubles it compares whether the absolute
423-
#' difference is `<= abs_tol`; otherwise it uses equality. `NA`'s and `NaN`'s
424-
#' are considered equal to themselves and each other.
432+
#' @description LOCF meaning last observation carried forward (to later
433+
#' versions). Lags the vector by 1, then compares with itself. If `is_key` is
434+
#' `TRUE`, only values that are exactly the same between the lagged and
435+
#' original are considered LOCF. If `is_key` is `FALSE` and `vec` is a vector
436+
#' of numbers ([`base::is.numeric`]), then approximate equality will be used,
437+
#' checking whether the absolute difference between each pair of entries is
438+
#' `<= abs_tol`; if `vec` is something else, then exact equality is used
439+
#' instead.
440+
#'
441+
#' @details
442+
#'
443+
#' We include epikey-time columns in LOCF comparisons as part of an optimization
444+
#' to avoid slower grouped operations while still ensuring that the first
445+
#' observation for each time series will not be marked as LOCF. We test these
446+
#' key columns for exact equality to prevent chopping off consecutive
447+
#' time_values during flat periods when `abs_tol` is high.
448+
#'
449+
#' We use exact equality for non-`is.numeric` double/integer columns such as
450+
#' dates, datetimes, difftimes, `tsibble::yearmonth`s, etc., as these may be
451+
#' used as part of re-indexing or grouping procedures, and we don't want to
452+
#' change the number of groups for those operations when we remove LOCF data
453+
#' during compactification.
454+
#'
425455
#' @importFrom dplyr lag if_else
426456
#' @keywords internal
427-
is_locf <- function(vec, abs_tol) { # nolint: object_usage_linter
457+
is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter
428458
lag_vec <- dplyr::lag(vec)
429-
if (typeof(vec) == "double") {
459+
if (is.numeric(vec) && !is_key) {
430460
res <- if_else(
431461
!is.na(vec) & !is.na(lag_vec),
432462
abs(vec - lag_vec) <= abs_tol,

man/is_locf.Rd

Lines changed: 22 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-compactify.R

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ dt <- row_replace(dt, 74, 73, 74) # Not LOCF
5353
dt_true <- as_tibble(as_epi_archive(dt, compactify = TRUE)$DT)
5454
dt_false <- as_tibble(as_epi_archive(dt, compactify = FALSE)$DT)
5555
dt_message <- suppressMessages(as_tibble(as_epi_archive(dt, compactify = "message")$DT))
56+
dt_0 <- as_tibble(as_epi_archive(dt, compactify = TRUE, compactify_abs_tol = 0)$DT)
5657

5758
test_that('Warning for LOCF with compactify as "message"', {
5859
expect_message(as_epi_archive(dt, compactify = "message"))
@@ -71,6 +72,9 @@ test_that("LOCF values are taken out with compactify=TRUE", {
7172

7273
expect_identical(dt_true, dt_message)
7374
expect_identical(dt_message, dt_test)
75+
76+
# Tolerance is nonstrict and tolerance 0 still compactifies:
77+
expect_identical(dt_0, dt_test)
7478
})
7579

7680
test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", {
@@ -105,3 +109,48 @@ test_that("compactify does not alter the default clobberable and observed versio
105109
expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start)
106110
expect_identical(ea_true$versions_end, ea_false$versions_end)
107111
})
112+
113+
test_that("Large compactify_abs_tol does not drop edf keys", {
114+
# several epikeytimes, each with a single version
115+
x <- tibble(
116+
geo_value = 1,
117+
time_value = 1:5,
118+
version = 11:15,
119+
value = 1001:1005
120+
)
121+
# We shouldn't drop epikeytimes:
122+
expect_equal(as_tibble(as.data.frame(as_epi_archive(x, compactify_abs_tol = 3)$DT)), x)
123+
})
124+
125+
test_that("Large compactify_abs_tol does not apply to non-is.numeric columns", {
126+
# one epikeytime with many versions:
127+
d <- as.Date("2000-01-01")
128+
x <- tibble(
129+
geo_value = 1,
130+
time_value = d + 1,
131+
version = d + 11:15,
132+
lag = version - time_value, # non-is.numeric
133+
value = 1001:1005
134+
)
135+
expect_equal(as_tibble(as.data.frame(as_epi_archive(x, compactify_abs_tol = 3)$DT)), x)
136+
})
137+
138+
test_that("Large compactify_abs_tol works on value columns", {
139+
# one epikeytime with many versions:
140+
d <- as.Date("2000-01-01")
141+
x <- tibble(
142+
geo_value = 1,
143+
time_value = d + 1,
144+
version = d + 11:15,
145+
value = 1001:1005
146+
)
147+
expect_equal(
148+
as_tibble(as.data.frame(as_epi_archive(x, compactify_abs_tol = 3)$DT)),
149+
tibble(
150+
geo_value = 1,
151+
time_value = d + 1,
152+
version = d + 11, # XXX do we want d + c(11,14) instead?
153+
value = 1001 # XXX do we want c(1001, 1004) instead?
154+
)
155+
)
156+
})

0 commit comments

Comments
 (0)