Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/newline check #58

Merged
merged 11 commits into from
Jan 30, 2024
76 changes: 52 additions & 24 deletions R/has_newline_at_end.R
Original file line number Diff line number Diff line change
@@ -1,33 +1,61 @@
# Determine if file has trailing newline
# returns TRUE if file has newline at end
# returns FALSE if file has no newline at end
# returns NA if file is not accessible or is not a text file
file_has_newline_at_end <- function(filepath) {
stopifnot(
"`filepath` must be length 1" = length(filepath) == 1L
)
if (!is_file_accessible(filepath) || !is_text_file(filepath)) {
return(NA)
}
if (.Platform$OS.type == "unix") {
newlines_in_final_char <- system(
command = paste(
"tail -c1 ", filepath, # tail -c1 returns last 1 byte
" | ", # pipe to next command
"wc -l" # wc -l returns number of newline characters in input
),
intern = TRUE
)
has_trailing_newline <- as.logical(as.integer(newlines_in_final_char))
} else {
# Windows does not have a POSIX compliant `tail` command
# so we use builtin R functionality, which is slower
con <- file(filepath, "rb")
on.exit(close(con))

not_at_end <- TRUE
chars <- ""
while (not_at_end) {
prev_chars <- chars
chars <- readChar(con, nchars = 2048L, useBytes = TRUE)
if (length(chars) == 0L) {
not_at_end <- FALSE
}
}

has_trailing_newline <- grepl(
# using [\n] instead of [\n\r] because modern systems use either \n or
# \r\n as newline, both of which match \n as last character.
"[\n]$", iconv(prev_chars, to = "UTF-8", sub = "")
)

}

return(has_trailing_newline)
}

has_newline_at_end <- function(filepaths) {
filepaths <- simplify_if_one_col_df(filepaths)
stopifnot("`filepaths` must be a character vector" = typeof(filepaths) == "character")
stopifnot(
"`filepaths` must be a character vector" = typeof(filepaths) == "character"
)
filepaths <- canonize_path(filepaths)

vapply(
X = filepaths,
FUN = function(filepath) {
if (!is_file_accessible(filepath) || !is_text_file(filepath)) {
return(NA)
}

con <- file(filepath, "rb")
on.exit(close(con))

not_at_end <- TRUE
chars <- ""
while (not_at_end) {
prev_chars <- chars
chars <- readChar(con, nchars = 2048L, useBytes = TRUE)
if (length(chars) == 0L) {
not_at_end <- FALSE
}
}

# using [\n] instead of [\n\r] because modern systems use either \n or
# \r\n as newline, both of which match \n as last character.
# \r as last character should not be recognized as trailing newline.
grepl("[\n]$", iconv(prev_chars, to = "UTF-8", sub = ""))
},
FUN = file_has_newline_at_end,
FUN.VALUE = logical(1),
USE.NAMES = FALSE
)
Expand Down
17 changes: 17 additions & 0 deletions tests/testthat/test-has_newline_at_end.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ test_that("has_newline_at_end identifies a binary file", {
expect_equal(has_newline_at_end(binary), NA)
})

test_that("has_newline_at_end identifies an empty file", {
empty <- withr::local_tempfile()
write.table(data.frame(), file = empty, col.names = FALSE)
expect_identical(as.integer(file.size(empty)), 0L)
expect_identical(has_newline_at_end(empty), NA)
})

test_that("has_newline_at_end identifies an inaccessible filepath", {
filepath <- "non-existant-file.txt"
expect_false(file.exists(filepath))
Expand All @@ -53,3 +60,13 @@ test_that("has_newline_at_end behaves as expected when multiple filepaths are pa
expect_equal(has_newline_at_end(c(no_newline, no_newline)), c(FALSE, FALSE))
expect_equal(has_newline_at_end(c(newline, newline)), c(TRUE, TRUE))
})

test_that("file_has_newline_at_end expects a single file", {
f1 <- withr::local_tempfile()
f2 <- withr::local_tempfile()
expect_error(
file_has_newline_at_end(c(f1, f2)),
"`filepath` must be length 1"
)
})

Loading