diff --git a/NEWS.md b/NEWS.md index 3b93a7df..e798ce6a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,11 @@ -# xportr 0.4.0 +# xportr 0.5.0 -## xportr development version +## New Feature * New argument in `xportr_write()` allows users to specify the maximum file size (in GB) of their exported xpt files. (#268) +# xportr 0.4.0 + ## New Features * All core functions can be run together by using new function `xportr()` (#137) diff --git a/R/write.R b/R/write.R index cfb6c437..9326c9ac 100644 --- a/R/write.R +++ b/R/write.R @@ -7,7 +7,8 @@ #' @param .df A data frame to write. #' @param path Path where transport file will be written. File name sans will be #' used as `xpt` name. -#' @param max_size_gb Maximum size in GB of the exported files. +#' @param max_size_gb Maximum size in GB of the exported file(s). If size of xpt file exceeds the specified maximum, +#' it will split the data frame into multiple exported chunk(s). #' @param label `r lifecycle::badge("deprecated")` Previously used to to set the Dataset label. #' Use the `metadata` argument to set the dataset label. #' @param strict_checks If TRUE, xpt validation will report errors and not write @@ -75,6 +76,7 @@ xportr_write <- function(.df, assert_data_frame(.df) assert_string(path) + checkmate::assert_numeric(max_size_gb, null.ok = TRUE) assert_metadata(metadata, null.ok = TRUE) assert_logical(strict_checks) @@ -90,6 +92,10 @@ xportr_write <- function(.df, assert(".df file name must be 8 characters or less.", .var.name = "path") } + if (is.numeric(max_size_gb) && max_size_gb <= 0) { + assert("max_size_gb must be NULL or a numeric value greater than 0.", .var.name = "max_size_gb") + } + checks <- xpt_validate(.df) if (stringr::str_detect(name, "[^a-zA-Z0-9]")) { @@ -160,43 +166,58 @@ export_to_xpt <- function(.df, path, max_size_gb, file_prefix) { # Convert GB to bytes max_size_bytes <- max_size_gb * 1000^3 - total_rows <- nrow(.df) - chunk_counter <- 1 - row_start <- 1 + temp_file <- tempfile() + write_xpt(.df, temp_file) + + file_size <- file.info(temp_file)$size + unlink(temp_file) # Clean up the temporary file + dir_path <- dirname(path) - while (row_start <= total_rows) { - # Binary search to find the maximum number of rows that fit within the size limit - low <- row_start - high <- total_rows - best_fit <- row_start - - while (low <= high) { - mid <- floor((low + high) / 2) - temp_file <- tempfile() - write_xpt(.df[row_start:mid, ], temp_file) - file_size <- file.info(temp_file)$size - - if (file_size <= max_size_bytes) { - best_fit <- mid - low <- mid + 1 - } else { - high <- mid - 1 + if (file_size <= max_size_bytes) { + chunk_counter <- 2 + file_name <- sprintf("%s.xpt", file_prefix) + path <- file.path(dir_path, file_name) + write_xpt(.df, path = path, version = 5, name = file_prefix) + } else { + total_rows <- nrow(.df) + row_start <- 1 + chunk_counter <- 1 + + while (row_start <= total_rows) { + # Binary search to find the maximum number of rows that fit within the size limit + low <- row_start + high <- total_rows + best_fit <- row_start + + while (low <= high) { + mid <- floor((low + high) / 2) + write_xpt(.df[row_start:mid, ], temp_file) + file_size <- file.info(temp_file)$size + + if (file_size <= max_size_bytes) { + best_fit <- mid + low <- mid + 1 + } else { + high <- mid - 1 + } + + unlink(temp_file) # Clean up the temporary file } - unlink(temp_file) # Clean up the temporary file - } + # Write the best fitting chunk to the final file + chunk <- .df[row_start:best_fit, ] - # Write the best fitting chunk to the final file - file_name <- sprintf("%s%d.xpt", file_prefix, chunk_counter) - path <- file.path(dir_path, file_name) + file_name <- sprintf("%s%d.xpt", file_prefix, chunk_counter) + path <- file.path(dir_path, file_name) - write_xpt(.df[row_start:best_fit, ], path = path, version = 5, name = file_prefix) + write_xpt(chunk, path = path, version = 5, name = file_prefix) - # Update counters - row_start <- best_fit + 1 - chunk_counter <- chunk_counter + 1 + # Update counters + row_start <- best_fit + 1 + chunk_counter <- chunk_counter + 1 + } } - cat("Data frame exported to", chunk_counter - 1, "xpt files.\n") + message("Data frame exported to ", chunk_counter - 1, " xpt files.") } diff --git a/man/xportr_write.Rd b/man/xportr_write.Rd index 58dff195..f9b42b18 100644 --- a/man/xportr_write.Rd +++ b/man/xportr_write.Rd @@ -20,7 +20,8 @@ xportr_write( \item{path}{Path where transport file will be written. File name sans will be used as \code{xpt} name.} -\item{max_size_gb}{Maximum size in GB of the exported files.} +\item{max_size_gb}{Maximum size in GB of the exported file(s). If size of xpt file exceeds the specified maximum, +it will split the data frame into multiple exported chunk(s).} \item{metadata}{A data frame containing dataset. See 'Metadata' section for details.} diff --git a/tests/testthat/test-write.R b/tests/testthat/test-write.R index aad9e69e..23289f94 100644 --- a/tests/testthat/test-write.R +++ b/tests/testthat/test-write.R @@ -212,8 +212,33 @@ test_that("write Test 13: Capture errors by haven and report them as such", { ) }) -## Test 14: xportr_write: `max_size_gb` is used to split data frame into specified maximum file size ---- -test_that("write Test 14: `max_size_gb` is used to split data frame into specified maximum file size", { +## Test 14: xportr_write: if file size is less than `max_size_gb`, export only one file ---- +test_that("write Test 14: if file size is less than `max_size_gb`, export only one file", { + adlb <- pharmaverseadam::adlb + + tmpdir <- tempdir() + + # 5 GB + max_size_gb <- 5 + + expect_message( + xportr_write(adlb, + path = paste0(tmpdir, "/adlb.xpt"), + domain = "adlb", + max_size_gb = max_size_gb, + strict_checks = FALSE + ), + "Data frame exported to 1 xpt files." + ) + + expect_true( + file.exists(file.path(tmpdir, "adlb.xpt")), + file.info(file.path(tmpdir, "adlb.xpt"))$size <= as.numeric(format(max_size_gb * 10^9, scientific = FALSE)) + ) +}) + +## Test 15: xportr_write: `max_size_gb` is used to split data frame into specified maximum file size ---- +test_that("write Test 15: `max_size_gb` is used to split data frame into specified maximum file size", { adlb <- pharmaverseadam::adlb tmpdir <- tempdir() @@ -221,11 +246,14 @@ test_that("write Test 14: `max_size_gb` is used to split data frame into specifi # 20 mb max_size_gb <- 20 / 1000 - xportr_write(adlb, - path = paste0(tmpdir, "/adlb.xpt"), - domain = "adlb", - max_size_gb = max_size_gb, - strict_checks = FALSE + expect_message( + xportr_write(adlb, + path = paste0(tmpdir, "/adlb.xpt"), + domain = "adlb", + max_size_gb = max_size_gb, + strict_checks = FALSE + ), + "Data frame exported to 5 xpt files." ) expect_true( @@ -254,7 +282,8 @@ test_that("write Test 14: `max_size_gb` is used to split data frame into specifi ) }) -## Test 15: xportr_write: Large file sizes are reported and warned ---- + +## Test 16: xportr_write: Large file sizes are reported and warned ---- test_that("write Test 15: Large file sizes are reported and warned", { skip_if_not(test_large_files) tmpdir <- tempdir()