Closes #183 Add xportr_split Functionality (#235)

* add in split function * Add tests and functionality * Testing and doc updates * Update tests to correct skip condition and add cli_warn * Update with lint and style * exclude large file size from cov * Update R/split.R Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * Update R/split.R Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * Update R/write.R Co-authored-by: Ben Straub <ben.x.straub@gsk.com> * Update for passing R CMD Check * Update R/split.R --------- Co-authored-by: Ben Straub <ben.x.straub@gsk.com>
atorus-research · Feb 26, 2024 · 63c52b1 · 63c52b1
1 parent c5468df
commit 63c52b1
Show file tree

Hide file tree

Showing 11 changed files with 209 additions and 4 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,7 @@ export(xportr_length)
 export(xportr_metadata)
 export(xportr_options)
 export(xportr_order)
+export(xportr_split)
 export(xportr_type)
 export(xportr_write)
 export(xpt_validate)
@@ -32,6 +33,7 @@ importFrom(cli,cli_alert_success)
 importFrom(cli,cli_div)
 importFrom(cli,cli_h2)
 importFrom(cli,cli_text)
+importFrom(cli,cli_warn)
 importFrom(dplyr,across)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)

diff --git a/NEWS.md b/NEWS.md
@@ -21,6 +21,7 @@
 * Added `xportr.character_metadata_types` and `xportr.numeric_metadata_types` to list the metadata types that are character or numeric. Updated `xportr.character_types` and `xportr.numeric_types` to list only the R types that are character and the R types that are numeric. This ensures that all R types, including dates, are now managed by xportr_type. If the R type differs from the metadata type, the variable is coerced (#161)..
 
 * Adds argument assertions to public functions using `{checkmate}` (#175)
+* `xportr_split()` is a new function that allows users to split a dataset into multiple output files based on a variable. (#183)
 
 * `xportr_metadata()` can set `verbose` for a whole pipeline, i.e. setting `verbose` in `xportr_metadata()` will populate to all `xportr` functions.  (#151)
 

diff --git a/R/length.R b/R/length.R
@@ -162,7 +162,7 @@ xportr_length <- function(.df,
         length_meta = as.numeric(length_msg[[paste0(variable_length, ".y")]])
       ) %>%
       filter(length_df < length_meta) %>%
-      select(all_of(variable_name), length_df, length_meta)
+      select(any_of(c(variable_name, "length_df", "length_meta")))
 
     max_length_msg(length_msg, verbose)
   }

diff --git a/R/split.R b/R/split.R
@@ -0,0 +1,35 @@
+#' Split xpt file output
+#'
+#' Per the FDA Study Data Technical Conformance
+#' Guide(https://www.fda.gov/media/88173/download) section 3.3.2, dataset files
+#' sizes shouldn't exceed 5 GB. If datasets are large enough, they should be
+#' split based on a variable. For example, laboratory readings in `ADLB` can be
+#' split by `LBCAT` to split up hematology and chemistry data.
+#'
+#' This function will tell `xportr_write()` to split the data frame based on the
+#' variable passed in `split_by`. When written, the file name will be prepended
+#' with a number for uniqueness. These files should be noted in the Reviewer Guides per
+#' CDISC guidance to note how you split your files.
+#'
+#' @inheritParams xportr_length
+#' @param split_by A quoted variable that will be passed to `base::split()`.
+#'
+#' @return A data frame with an additional attribute added so `xportr_write()`
+#'   knows how to split the data frame.
+#'
+#'
+#' @export
+#'
+#' @examples
+#'
+#' adlb <- data.frame(
+#'   USUBJID = c(1001, 1002, 1003),
+#'   LBCAT = c("HEMATOLOGY", "HEMATOLOGY", "CHEMISTRY")
+#' )
+#'
+#' adsl <- xportr_split(adsl, "LBCAT")
+xportr_split <- function(.df, split_by = NULL) {
+  attr(.df, "_xportr.split_by_") <- split_by
+
+  return(.df)
+}
diff --git a/R/utils-xportr.R b/R/utils-xportr.R
@@ -440,3 +440,23 @@ assert_metadata <- function(metadata,
 #' Internal choices for verbose option
 #' @noRd
 .internal_verbose_choices <- c("none", "warn", "message", "stop")
+
+#' Internal function to check xpt file size
+#' @noRd
+check_xpt_size <- function(path) {
+  fs <- file.size(path)
+
+  fs_string <- c(
+    "i" = paste0("xpt file size is: ", round(fs / 1e+9, 2)), " GB.",
+    "x" = paste0(
+      "XPT file sizes should not exceed 5G. It is",
+      " recommended you call `xportr_split` to split the file into smaller files."
+    )
+  )
+
+  if (fs > 5e+9) {
+    cli_warn(fs_string, class = "xportr.xpt_size") # nocov
+  }
+
+  invisible(NULL)
+}
diff --git a/R/write.R b/R/write.R
@@ -105,7 +105,26 @@ xportr_write <- function(.df,
   data <- as.data.frame(.df)
 
   tryCatch(
-    write_xpt(data, path = path, version = 5, name = name),
+    {
+      # If data is not split, data is just written out
+      if (is.null(attr(data, "_xportr.split_by_"))) {
+        write_xpt(data, path = path, version = 5, name = name)
+        check_xpt_size(path)
+      } else {
+        # If data is split, perform the split and get an index for the for loop
+        split_data <- split(data, data[[attr(data, "_xportr.split_by_")]])
+        split_index <- unique(data[[attr(data, "_xportr.split_by_")]])
+        paths <- get_split_path(path, seq_along(split_index))
+        # Iterate on the unique values of the split
+        for (i in seq_along(split_index)) {
+          # Write out the split data, `get_split_path` will determine file name
+          write_xpt(split_data[[i]],
+            path = paths[i], version = 5, name = name
+          )
+          check_xpt_size(paths[i])
+        }
+      }
+    },
     error = function(err) {
       rlang::abort(
         paste0(
@@ -118,3 +137,22 @@ xportr_write <- function(.df,
 
   invisible(data)
 }
+
+#' Figure out path for split data.
+#'
+#' Will append a number before the file extension to indicate the split.
+#'
+#' i.e. `adsl.xpt` will become `adsl1.xpt` and `adsl2.xpt`
+#'
+#' @param path Path variable specified by user
+#' @param ind Index of split variable
+#'
+#' @noRd
+get_split_path <- function(path, ind) {
+  paste0(
+    tools::file_path_sans_ext(path),
+    ind,
+    ".",
+    tools::file_ext(path)
+  )
+}
diff --git a/R/xportr-package.R b/R/xportr-package.R
@@ -107,7 +107,7 @@
 #'   tribble if_else across as_tibble
 #' @importFrom glue glue glue_collapse
 #' @importFrom cli cli_alert_info cli_h2 cli_alert_success cli_div cli_text
-#'   cli_alert_danger
+#'   cli_alert_danger cli_warn
 #' @importFrom tidyselect all_of any_of where
 #' @importFrom utils capture.output str tail packageVersion
 #' @importFrom stringr str_detect str_extract str_replace str_replace_all
@@ -125,7 +125,7 @@ globalVariables(c(
   "abbr_parsed", "abbr_stem", "adj_orig", "adj_parsed", "col_pos", "dict_varname",
   "lower_original_varname", "my_minlength", "num_st_ind", "original_varname",
   "renamed_n", "renamed_var", "use_bundle", "viable_start", "type.x", "type.y",
-  "variable", "length.x", "lenght.y"
+  "variable", "length.x", "lenght.y", "e", "length_df", "length_meta"
 ))
 
 # The following block is used by usethis to automatically manage

diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -1,6 +1,7 @@
 ADAE
 ADSL
 ADaM
+adlb
 AE
 Atorus
 BMI
@@ -10,6 +11,7 @@ Completers
 DCREASCD
 DM
 Didenko
+fda
 GSK
 JPT
 MMSE
@@ -55,3 +57,6 @@ validator
 validators
 xportr's
 xpt
+https
+lbcat
+www
diff --git a/man/xportr-package.Rd b/man/xportr-package.Rd
diff --git a/man/xportr_split.Rd b/man/xportr_split.Rd
diff --git a/tests/testthat/test-write.R b/tests/testthat/test-write.R
@@ -4,6 +4,9 @@ data_to_save <- function() {
     as_tibble()
 }
 
+# Skip large file tests unless explicitly requested
+test_large_files <- Sys.getenv("XPORTR.TEST_LARGE_FILES", FALSE)
+
 test_that("xportr_write: exported data can be saved to a file", {
   skip_if_not_installed("withr")
   tmp <- withr::local_file("xyz.xpt")
@@ -194,3 +197,64 @@ test_that("xportr_write: Capture errors by haven and report them as such", {
     "Error reported by haven"
   )
 })
+
+test_that("xportr_write: `split_by` attribute is used to split the data", {
+  tmpdir <- tempdir()
+  tmp <- file.path(tmpdir, "xyz.xpt")
+
+  on.exit(unlink(tmpdir))
+
+  dts <- data_to_save()
+  dts %>%
+    xportr_split(split_by = "X") %>%
+    xportr_write(path = tmp)
+
+  expect_true(
+    file.exists(file.path(tmpdir, "xyz1.xpt"))
+  )
+  expect_true(
+    file.exists(file.path(tmpdir, "xyz2.xpt"))
+  )
+  expect_true(
+    file.exists(file.path(tmpdir, "xyz3.xpt"))
+  )
+  expect_equal(
+    read_xpt(file.path(tmpdir, "xyz1.xpt")) %>%
+      extract2("X") %>%
+      unique() %>%
+      length(),
+    1
+  )
+  expect_equal(
+    read_xpt(file.path(tmpdir, "xyz2.xpt")) %>%
+      extract2("X") %>%
+      unique() %>%
+      length(),
+    1
+  )
+  expect_equal(
+    read_xpt(file.path(tmpdir, "xyz3.xpt")) %>%
+      extract2("X") %>%
+      unique() %>%
+      length(),
+    1
+  )
+})
+
+test_that("xportr_write: Large file sizes are reported and warned", {
+  skip_if_not(test_large_files)
+  tmpdir <- tempdir()
+  tmp <- file.path(tmpdir, "xyz.xpt")
+
+  on.exit(unlink(tmpdir))
+
+  # Large_df should be at least 5GB
+  large_df <- do.call(
+    data.frame, replicate(80000, rep("large", 80000), simplify = FALSE)
+  )
+
+  expect_warning(
+    xportr_write(large_df, path = tmp),
+    class = "xportr.xpt_size"
+  )
+})