Skip to content

Commit

Permalink
Closes #183 Add xportr_split Functionality (#235)
Browse files Browse the repository at this point in the history
* add in split function

* Add tests and functionality

* Testing and doc updates

* Update tests to correct skip condition and add cli_warn

* Update with lint and style

* exclude large file size from cov

* Update R/split.R

Co-authored-by: Ben Straub <ben.x.straub@gsk.com>

* Update R/split.R

Co-authored-by: Ben Straub <ben.x.straub@gsk.com>

* Update R/write.R

Co-authored-by: Ben Straub <ben.x.straub@gsk.com>

* Update for passing R CMD Check

* Update R/split.R

---------

Co-authored-by: Ben Straub <ben.x.straub@gsk.com>
  • Loading branch information
elimillera and bms63 authored Feb 26, 2024
1 parent c5468df commit 63c52b1
Show file tree
Hide file tree
Showing 11 changed files with 209 additions and 4 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export(xportr_length)
export(xportr_metadata)
export(xportr_options)
export(xportr_order)
export(xportr_split)
export(xportr_type)
export(xportr_write)
export(xpt_validate)
Expand All @@ -32,6 +33,7 @@ importFrom(cli,cli_alert_success)
importFrom(cli,cli_div)
importFrom(cli,cli_h2)
importFrom(cli,cli_text)
importFrom(cli,cli_warn)
importFrom(dplyr,across)
importFrom(dplyr,arrange)
importFrom(dplyr,as_tibble)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
* Added `xportr.character_metadata_types` and `xportr.numeric_metadata_types` to list the metadata types that are character or numeric. Updated `xportr.character_types` and `xportr.numeric_types` to list only the R types that are character and the R types that are numeric. This ensures that all R types, including dates, are now managed by xportr_type. If the R type differs from the metadata type, the variable is coerced (#161)..

* Adds argument assertions to public functions using `{checkmate}` (#175)
* `xportr_split()` is a new function that allows users to split a dataset into multiple output files based on a variable. (#183)

* `xportr_metadata()` can set `verbose` for a whole pipeline, i.e. setting `verbose` in `xportr_metadata()` will populate to all `xportr` functions. (#151)

Expand Down
2 changes: 1 addition & 1 deletion R/length.R
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ xportr_length <- function(.df,
length_meta = as.numeric(length_msg[[paste0(variable_length, ".y")]])
) %>%
filter(length_df < length_meta) %>%
select(all_of(variable_name), length_df, length_meta)
select(any_of(c(variable_name, "length_df", "length_meta")))

max_length_msg(length_msg, verbose)
}
Expand Down
35 changes: 35 additions & 0 deletions R/split.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#' Split xpt file output
#'
#' Per the FDA Study Data Technical Conformance
#' Guide(https://www.fda.gov/media/88173/download) section 3.3.2, dataset files
#' sizes shouldn't exceed 5 GB. If datasets are large enough, they should be
#' split based on a variable. For example, laboratory readings in `ADLB` can be
#' split by `LBCAT` to split up hematology and chemistry data.
#'
#' This function will tell `xportr_write()` to split the data frame based on the
#' variable passed in `split_by`. When written, the file name will be prepended
#' with a number for uniqueness. These files should be noted in the Reviewer Guides per
#' CDISC guidance to note how you split your files.
#'
#' @inheritParams xportr_length
#' @param split_by A quoted variable that will be passed to `base::split()`.
#'
#' @return A data frame with an additional attribute added so `xportr_write()`
#' knows how to split the data frame.
#'
#'
#' @export
#'
#' @examples
#'
#' adlb <- data.frame(
#' USUBJID = c(1001, 1002, 1003),
#' LBCAT = c("HEMATOLOGY", "HEMATOLOGY", "CHEMISTRY")
#' )
#'
#' adsl <- xportr_split(adsl, "LBCAT")
xportr_split <- function(.df, split_by = NULL) {
attr(.df, "_xportr.split_by_") <- split_by

return(.df)
}
20 changes: 20 additions & 0 deletions R/utils-xportr.R
Original file line number Diff line number Diff line change
Expand Up @@ -440,3 +440,23 @@ assert_metadata <- function(metadata,
#' Internal choices for verbose option
#' @noRd
.internal_verbose_choices <- c("none", "warn", "message", "stop")

#' Internal function to check xpt file size
#' @noRd
check_xpt_size <- function(path) {
fs <- file.size(path)

fs_string <- c(
"i" = paste0("xpt file size is: ", round(fs / 1e+9, 2)), " GB.",
"x" = paste0(
"XPT file sizes should not exceed 5G. It is",
" recommended you call `xportr_split` to split the file into smaller files."
)
)

if (fs > 5e+9) {
cli_warn(fs_string, class = "xportr.xpt_size") # nocov
}

invisible(NULL)
}
40 changes: 39 additions & 1 deletion R/write.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,26 @@ xportr_write <- function(.df,
data <- as.data.frame(.df)

tryCatch(
write_xpt(data, path = path, version = 5, name = name),
{
# If data is not split, data is just written out
if (is.null(attr(data, "_xportr.split_by_"))) {
write_xpt(data, path = path, version = 5, name = name)
check_xpt_size(path)
} else {
# If data is split, perform the split and get an index for the for loop
split_data <- split(data, data[[attr(data, "_xportr.split_by_")]])
split_index <- unique(data[[attr(data, "_xportr.split_by_")]])
paths <- get_split_path(path, seq_along(split_index))
# Iterate on the unique values of the split
for (i in seq_along(split_index)) {
# Write out the split data, `get_split_path` will determine file name
write_xpt(split_data[[i]],
path = paths[i], version = 5, name = name
)
check_xpt_size(paths[i])
}
}
},
error = function(err) {
rlang::abort(
paste0(
Expand All @@ -118,3 +137,22 @@ xportr_write <- function(.df,

invisible(data)
}

#' Figure out path for split data.
#'
#' Will append a number before the file extension to indicate the split.
#'
#' i.e. `adsl.xpt` will become `adsl1.xpt` and `adsl2.xpt`
#'
#' @param path Path variable specified by user
#' @param ind Index of split variable
#'
#' @noRd
get_split_path <- function(path, ind) {
paste0(
tools::file_path_sans_ext(path),
ind,
".",
tools::file_ext(path)
)
}
4 changes: 2 additions & 2 deletions R/xportr-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@
#' tribble if_else across as_tibble
#' @importFrom glue glue glue_collapse
#' @importFrom cli cli_alert_info cli_h2 cli_alert_success cli_div cli_text
#' cli_alert_danger
#' cli_alert_danger cli_warn
#' @importFrom tidyselect all_of any_of where
#' @importFrom utils capture.output str tail packageVersion
#' @importFrom stringr str_detect str_extract str_replace str_replace_all
Expand All @@ -125,7 +125,7 @@ globalVariables(c(
"abbr_parsed", "abbr_stem", "adj_orig", "adj_parsed", "col_pos", "dict_varname",
"lower_original_varname", "my_minlength", "num_st_ind", "original_varname",
"renamed_n", "renamed_var", "use_bundle", "viable_start", "type.x", "type.y",
"variable", "length.x", "lenght.y"
"variable", "length.x", "lenght.y", "e", "length_df", "length_meta"
))

# The following block is used by usethis to automatically manage
Expand Down
5 changes: 5 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
ADAE
ADSL
ADaM
adlb
AE
Atorus
BMI
Expand All @@ -10,6 +11,7 @@ Completers
DCREASCD
DM
Didenko
fda
GSK
JPT
MMSE
Expand Down Expand Up @@ -55,3 +57,6 @@ validator
validators
xportr's
xpt
https
lbcat
www
1 change: 1 addition & 0 deletions man/xportr-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions man/xportr_split.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 64 additions & 0 deletions tests/testthat/test-write.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ data_to_save <- function() {
as_tibble()
}

# Skip large file tests unless explicitly requested
test_large_files <- Sys.getenv("XPORTR.TEST_LARGE_FILES", FALSE)

test_that("xportr_write: exported data can be saved to a file", {
skip_if_not_installed("withr")
tmp <- withr::local_file("xyz.xpt")
Expand Down Expand Up @@ -194,3 +197,64 @@ test_that("xportr_write: Capture errors by haven and report them as such", {
"Error reported by haven"
)
})

test_that("xportr_write: `split_by` attribute is used to split the data", {
tmpdir <- tempdir()
tmp <- file.path(tmpdir, "xyz.xpt")

on.exit(unlink(tmpdir))

dts <- data_to_save()
dts %>%
xportr_split(split_by = "X") %>%
xportr_write(path = tmp)

expect_true(
file.exists(file.path(tmpdir, "xyz1.xpt"))
)
expect_true(
file.exists(file.path(tmpdir, "xyz2.xpt"))
)
expect_true(
file.exists(file.path(tmpdir, "xyz3.xpt"))
)
expect_equal(
read_xpt(file.path(tmpdir, "xyz1.xpt")) %>%
extract2("X") %>%
unique() %>%
length(),
1
)
expect_equal(
read_xpt(file.path(tmpdir, "xyz2.xpt")) %>%
extract2("X") %>%
unique() %>%
length(),
1
)
expect_equal(
read_xpt(file.path(tmpdir, "xyz3.xpt")) %>%
extract2("X") %>%
unique() %>%
length(),
1
)
})

test_that("xportr_write: Large file sizes are reported and warned", {
skip_if_not(test_large_files)
tmpdir <- tempdir()
tmp <- file.path(tmpdir, "xyz.xpt")

on.exit(unlink(tmpdir))

# Large_df should be at least 5GB
large_df <- do.call(
data.frame, replicate(80000, rep("large", 80000), simplify = FALSE)
)

expect_warning(
xportr_write(large_df, path = tmp),
class = "xportr.xpt_size"
)
})

0 comments on commit 63c52b1

Please sign in to comment.