Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
Adrian Chan committed Sep 6, 2024
1 parent e1213d4 commit 8c5a2c7
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 42 deletions.
6 changes: 4 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# xportr 0.4.0
# xportr 0.5.0

## xportr development version
## New Feature

* New argument in `xportr_write()` allows users to specify the maximum file size (in GB) of their exported xpt files. (#268)

# xportr 0.4.0

## New Features

* All core functions can be run together by using new function `xportr()` (#137)
Expand Down
83 changes: 52 additions & 31 deletions R/write.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
#' @param .df A data frame to write.
#' @param path Path where transport file will be written. File name sans will be
#' used as `xpt` name.
#' @param max_size_gb Maximum size in GB of the exported files.
#' @param max_size_gb Maximum size in GB of the exported file(s). If size of xpt file exceeds the specified maximum,
#' it will split the data frame into multiple exported chunk(s).
#' @param label `r lifecycle::badge("deprecated")` Previously used to to set the Dataset label.
#' Use the `metadata` argument to set the dataset label.
#' @param strict_checks If TRUE, xpt validation will report errors and not write
Expand Down Expand Up @@ -75,6 +76,7 @@ xportr_write <- function(.df,

assert_data_frame(.df)
assert_string(path)
checkmate::assert_numeric(max_size_gb, null.ok = TRUE)
assert_metadata(metadata, null.ok = TRUE)
assert_logical(strict_checks)

Expand All @@ -90,6 +92,10 @@ xportr_write <- function(.df,
assert(".df file name must be 8 characters or less.", .var.name = "path")
}

if (is.numeric(max_size_gb) && max_size_gb <= 0) {
assert("max_size_gb must be NULL or a numeric value greater than 0.", .var.name = "max_size_gb")
}

checks <- xpt_validate(.df)

if (stringr::str_detect(name, "[^a-zA-Z0-9]")) {
Expand Down Expand Up @@ -160,43 +166,58 @@ export_to_xpt <- function(.df, path, max_size_gb, file_prefix) {
# Convert GB to bytes
max_size_bytes <- max_size_gb * 1000^3

total_rows <- nrow(.df)
chunk_counter <- 1
row_start <- 1
temp_file <- tempfile()
write_xpt(.df, temp_file)

file_size <- file.info(temp_file)$size
unlink(temp_file) # Clean up the temporary file

dir_path <- dirname(path)

while (row_start <= total_rows) {
# Binary search to find the maximum number of rows that fit within the size limit
low <- row_start
high <- total_rows
best_fit <- row_start

while (low <= high) {
mid <- floor((low + high) / 2)
temp_file <- tempfile()
write_xpt(.df[row_start:mid, ], temp_file)
file_size <- file.info(temp_file)$size

if (file_size <= max_size_bytes) {
best_fit <- mid
low <- mid + 1
} else {
high <- mid - 1
if (file_size <= max_size_bytes) {
chunk_counter <- 2
file_name <- sprintf("%s.xpt", file_prefix)
path <- file.path(dir_path, file_name)
write_xpt(.df, path = path, version = 5, name = file_prefix)
} else {
total_rows <- nrow(.df)
row_start <- 1
chunk_counter <- 1

while (row_start <= total_rows) {
# Binary search to find the maximum number of rows that fit within the size limit
low <- row_start
high <- total_rows
best_fit <- row_start

while (low <= high) {
mid <- floor((low + high) / 2)
write_xpt(.df[row_start:mid, ], temp_file)
file_size <- file.info(temp_file)$size

if (file_size <= max_size_bytes) {
best_fit <- mid
low <- mid + 1
} else {
high <- mid - 1
}

unlink(temp_file) # Clean up the temporary file
}

unlink(temp_file) # Clean up the temporary file
}
# Write the best fitting chunk to the final file
chunk <- .df[row_start:best_fit, ]

# Write the best fitting chunk to the final file
file_name <- sprintf("%s%d.xpt", file_prefix, chunk_counter)
path <- file.path(dir_path, file_name)
file_name <- sprintf("%s%d.xpt", file_prefix, chunk_counter)
path <- file.path(dir_path, file_name)

write_xpt(.df[row_start:best_fit, ], path = path, version = 5, name = file_prefix)
write_xpt(chunk, path = path, version = 5, name = file_prefix)

# Update counters
row_start <- best_fit + 1
chunk_counter <- chunk_counter + 1
# Update counters
row_start <- best_fit + 1
chunk_counter <- chunk_counter + 1
}
}

cat("Data frame exported to", chunk_counter - 1, "xpt files.\n")
message("Data frame exported to ", chunk_counter - 1, " xpt files.")
}
3 changes: 2 additions & 1 deletion man/xportr_write.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 37 additions & 8 deletions tests/testthat/test-write.R
Original file line number Diff line number Diff line change
Expand Up @@ -212,20 +212,48 @@ test_that("write Test 13: Capture errors by haven and report them as such", {
)
})

## Test 14: xportr_write: `max_size_gb` is used to split data frame into specified maximum file size ----
test_that("write Test 14: `max_size_gb` is used to split data frame into specified maximum file size", {
## Test 14: xportr_write: if file size is less than `max_size_gb`, export only one file ----
test_that("write Test 14: if file size is less than `max_size_gb`, export only one file", {
adlb <- pharmaverseadam::adlb

tmpdir <- tempdir()

# 5 GB
max_size_gb <- 5

expect_message(
xportr_write(adlb,
path = paste0(tmpdir, "/adlb.xpt"),
domain = "adlb",
max_size_gb = max_size_gb,
strict_checks = FALSE
),
"Data frame exported to 1 xpt files."
)

expect_true(
file.exists(file.path(tmpdir, "adlb.xpt")),
file.info(file.path(tmpdir, "adlb.xpt"))$size <= as.numeric(format(max_size_gb * 10^9, scientific = FALSE))
)
})

## Test 15: xportr_write: `max_size_gb` is used to split data frame into specified maximum file size ----
test_that("write Test 15: `max_size_gb` is used to split data frame into specified maximum file size", {
adlb <- pharmaverseadam::adlb

tmpdir <- tempdir()

# 20 mb
max_size_gb <- 20 / 1000

xportr_write(adlb,
path = paste0(tmpdir, "/adlb.xpt"),
domain = "adlb",
max_size_gb = max_size_gb,
strict_checks = FALSE
expect_message(
xportr_write(adlb,
path = paste0(tmpdir, "/adlb.xpt"),
domain = "adlb",
max_size_gb = max_size_gb,
strict_checks = FALSE
),
"Data frame exported to 5 xpt files."
)

expect_true(
Expand Down Expand Up @@ -254,7 +282,8 @@ test_that("write Test 14: `max_size_gb` is used to split data frame into specifi
)
})

## Test 15: xportr_write: Large file sizes are reported and warned ----

## Test 16: xportr_write: Large file sizes are reported and warned ----
test_that("write Test 15: Large file sizes are reported and warned", {
skip_if_not(test_large_files)
tmpdir <- tempdir()
Expand Down

0 comments on commit 8c5a2c7

Please sign in to comment.