diff --git a/R/has_newline_at_end.R b/R/has_newline_at_end.R index 1be6750..c8babd7 100644 --- a/R/has_newline_at_end.R +++ b/R/has_newline_at_end.R @@ -1,33 +1,61 @@ +# Determine if file has trailing newline +# returns TRUE if file has newline at end +# returns FALSE if file has no newline at end +# returns NA if file is not accessible or is not a text file +file_has_newline_at_end <- function(filepath) { + stopifnot( + "`filepath` must be length 1" = length(filepath) == 1L + ) + if (!is_file_accessible(filepath) || !is_text_file(filepath)) { + return(NA) + } + if (.Platform$OS.type == "unix") { + newlines_in_final_char <- system( + command = paste( + "tail -c1 ", filepath, # tail -c1 returns last 1 byte + " | ", # pipe to next command + "wc -l" # wc -l returns number of newline characters in input + ), + intern = TRUE + ) + has_trailing_newline <- as.logical(as.integer(newlines_in_final_char)) + } else { + # Windows does not have a POSIX compliant `tail` command + # so we use builtin R functionality, which is slower + con <- file(filepath, "rb") + on.exit(close(con)) + + not_at_end <- TRUE + chars <- "" + while (not_at_end) { + prev_chars <- chars + chars <- readChar(con, nchars = 2048L, useBytes = TRUE) + if (length(chars) == 0L) { + not_at_end <- FALSE + } + } + + has_trailing_newline <- grepl( + # using [\n] instead of [\n\r] because modern systems use either \n or + # \r\n as newline, both of which match \n as last character. + "[\n]$", iconv(prev_chars, to = "UTF-8", sub = "") + ) + + } + + return(has_trailing_newline) +} + has_newline_at_end <- function(filepaths) { filepaths <- simplify_if_one_col_df(filepaths) - stopifnot("`filepaths` must be a character vector" = typeof(filepaths) == "character") + stopifnot( + "`filepaths` must be a character vector" = typeof(filepaths) == "character" + ) filepaths <- canonize_path(filepaths) vapply( X = filepaths, - FUN = function(filepath) { - if (!is_file_accessible(filepath) || !is_text_file(filepath)) { - return(NA) - } - - con <- file(filepath, "rb") - on.exit(close(con)) - - not_at_end <- TRUE - chars <- "" - while (not_at_end) { - prev_chars <- chars - chars <- readChar(con, nchars = 2048L, useBytes = TRUE) - if (length(chars) == 0L) { - not_at_end <- FALSE - } - } - - # using [\n] instead of [\n\r] because modern systems use either \n or - # \r\n as newline, both of which match \n as last character. - # \r as last character should not be recognized as trailing newline. - grepl("[\n]$", iconv(prev_chars, to = "UTF-8", sub = "")) - }, + FUN = file_has_newline_at_end, FUN.VALUE = logical(1), USE.NAMES = FALSE ) diff --git a/tests/testthat/test-has_newline_at_end.R b/tests/testthat/test-has_newline_at_end.R index 32e0571..decfbfd 100644 --- a/tests/testthat/test-has_newline_at_end.R +++ b/tests/testthat/test-has_newline_at_end.R @@ -33,6 +33,13 @@ test_that("has_newline_at_end identifies a binary file", { expect_equal(has_newline_at_end(binary), NA) }) +test_that("has_newline_at_end identifies an empty file", { + empty <- withr::local_tempfile() + write.table(data.frame(), file = empty, col.names = FALSE) + expect_identical(as.integer(file.size(empty)), 0L) + expect_identical(has_newline_at_end(empty), NA) +}) + test_that("has_newline_at_end identifies an inaccessible filepath", { filepath <- "non-existant-file.txt" expect_false(file.exists(filepath)) @@ -53,3 +60,13 @@ test_that("has_newline_at_end behaves as expected when multiple filepaths are pa expect_equal(has_newline_at_end(c(no_newline, no_newline)), c(FALSE, FALSE)) expect_equal(has_newline_at_end(c(newline, newline)), c(TRUE, TRUE)) }) + +test_that("file_has_newline_at_end expects a single file", { + f1 <- withr::local_tempfile() + f2 <- withr::local_tempfile() + expect_error( + file_has_newline_at_end(c(f1, f2)), + "`filepath` must be length 1" + ) +}) +