deal with files with only a header line and no data without error (#75)

closes #73 supersedes #74 This PR contains a lot to get to the root of the problem and not duplicate code bits that are used in multiple places. 1. Three new, unexported, utility functions are added (with tests) `has_header()`, `determine_header_types()`, and `read_first_line()`, and they are utilized in other functions. 2. `guess_numerical_mark()` is made more robust so that it will not error on a file with a single line (either header only or data only), and tests are added to keep it that way. One class of problem that I don't have a solution for is a CSV file with one line of data (no header) that has a numerical mark (used in an unquoted numeric) that collides with the delimiter, e.g. `XS1088274672,1,000.34,USD`. There is ambiguity in there that I can't resolve. 3. `read_portfolio_csv()` is made more robust so that it will not error on a file with only a header and no data (returns as expected NA) and tests for that are added. 4. Also introduced the idea of optionally passing `encoding` and `delimiter` to these utility functions so that they do not have to be determined so many times during the process of reading a portfolio CSV. Eventually, this concept should spread to other functions.
RMI-PACTA · Jan 31, 2024 · 5a449a1 · 5a449a1
1 parent 6d7e661
commit 5a449a1
Show file tree

Hide file tree

Showing 10 changed files with 229 additions and 20 deletions.
diff --git a/R/determine_header_types.R b/R/determine_header_types.R
@@ -0,0 +1,37 @@
+determine_header_types <- function(filepath, encoding = NULL, delimiter = NULL) {
+  if (!is_text_file(filepath)) {
+    return(NA)
+  }
+
+  if (is.null(encoding)) {
+    encoding <- guess_file_encoding(filepath)
+  }
+
+  if (is.null(delimiter)) {
+    delimiter <- guess_delimiter(filepath)
+  }
+
+  if (any(is.na(c(encoding, delimiter)))) {
+    return(NA)
+  }
+
+  first_line <-
+    read_first_line(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
+    )
+
+  vapply(
+    X = readr::spec(first_line)$cols,
+    FUN = function(x) {
+      sub(
+        pattern = "^collector_",
+        replacement = "",
+        x = class(x)[[1]]
+      )
+    },
+    FUN.VALUE = character(1),
+    USE.NAMES = FALSE
+  )
+}
diff --git a/R/determine_headers.R b/R/determine_headers.R
@@ -27,22 +27,29 @@ determine_headers <- function(filepath) {
     return(NA_character_)
   }
 
-  locale <-
-    readr::locale(
-      encoding = encoding
+  file_has_header <-
+    has_header(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
     )
 
-  first_line <- readr::read_delim(filepath, delim = delimiter, n_max = 1L, locale = locale, col_names = FALSE, show_col_types = FALSE, progress = FALSE)
-  header_types <- vapply(X = readr::spec(first_line)$cols, FUN = function(x) sub("^collector_", "", class(x)[[1]]), FUN.VALUE = character(1), USE.NAMES = FALSE)
+  first_line <-
+    read_first_line(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
+    )
 
-  if (all(header_types %in% c("character", "logical"))) {
-    has_header <- TRUE
-  } else {
-    has_header <- FALSE
-  }
+  header_types <-
+    determine_header_types(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
+    )
 
-  if (!has_header) {
-    num_of_cols <- length(header_types)
+  if (!file_has_header) {
+    num_of_cols <- ncol(first_line)
     if (num_of_cols == 3) {
       col_names <- c("isin", "market_value", "currency")
       names(col_names) <- col_names
@@ -92,7 +99,7 @@ determine_headers <- function(filepath) {
   if (not_portfolio_csv) {
     return(NA_character_)
   }
-  if (!has_header) {
+  if (!file_has_header) {
     return(col_names)
   }
 

diff --git a/R/guess_numerical_mark.R b/R/guess_numerical_mark.R
@@ -31,17 +31,25 @@ guess_numerical_mark <- function(filepaths, type = "decimal") {
       encoding <- guess_file_encoding(filepath)
       delimiter <- guess_delimiter(filepath)
 
-      cust_locale <-
-        readr::locale(
-          encoding = encoding
+      # return NA if file only has a header row and no data
+      file_has_header <-
+        has_header(
+          filepath = filepath,
+          encoding = encoding,
+          delimiter = delimiter
         )
 
+      only_one_line <- length(readLines(con = filepath, n = 2L)) < 2
+
+      if (file_has_header && only_one_line) {
+        return(NA_character_)
+      }
+
       # determine appropriate column
       headers <- determine_headers(filepath)
 
       if ("market_value" %in% names(headers)) {
         num_col_idx <- match("market_value", names(headers))
-        has_header_row <- TRUE
       } else {
         warning("`market_value` column could not be determined")
         return(NA_character_)
@@ -52,12 +60,12 @@ guess_numerical_mark <- function(filepaths, type = "decimal") {
         readr::read_delim(
           file = filepath,
           delim = delimiter,
-          locale = cust_locale,
+          locale = readr::locale(encoding = encoding),
           trim_ws = TRUE,
           col_types = readr::cols(.default = "c"),
           col_names = FALSE,
           col_select = dplyr::all_of(num_col_idx),
-          skip = dplyr::if_else(has_header_row, 1L, 0L),
+          skip = dplyr::if_else(file_has_header, 1L, 0L),
           progress = FALSE,
           show_col_types = FALSE
         )

diff --git a/R/has_header.R b/R/has_header.R
@@ -0,0 +1,32 @@
+has_header <- function(filepath, encoding = NULL, delimiter = NULL) {
+  if (!is_text_file(filepath)) {
+    return(NA)
+  }
+
+  if (is.null(encoding)) {
+    encoding <- guess_file_encoding(filepath)
+  }
+
+  if (is.null(delimiter)) {
+    delimiter <- guess_delimiter(filepath)
+  }
+
+  if (any(is.na(c(encoding, delimiter)))) {
+    return(NA)
+  }
+
+  header_types <-
+    determine_header_types(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
+    )
+
+  if (all(header_types %in% c("character", "logical"))) {
+    has_header <- TRUE
+  } else {
+    has_header <- FALSE
+  }
+
+  has_header
+}
diff --git a/R/read_first_line.R b/R/read_first_line.R
@@ -0,0 +1,27 @@
+read_first_line <- function(filepath, encoding = NULL, delimiter = NULL) {
+  if (!is_text_file(filepath)) {
+    return(NA)
+  }
+
+  if (is.null(encoding)) {
+    encoding <- guess_file_encoding(filepath)
+  }
+
+  if (is.null(delimiter)) {
+    delimiter <- guess_delimiter(filepath)
+  }
+
+  if (any(is.na(c(encoding, delimiter)))) {
+    return(NA)
+  }
+
+  readr::read_delim(
+    file = filepath,
+    delim = delimiter,
+    n_max = 1L,
+    locale = readr::locale(encoding = encoding),
+    col_names = FALSE,
+    show_col_types = FALSE,
+    progress = FALSE
+  )
+}
diff --git a/R/read_portfolio_csv.R b/R/read_portfolio_csv.R
@@ -64,6 +64,13 @@ read_portfolio_csv <- function(filepaths, combine = TRUE) {
 
       headers <- determine_headers(filepath)
 
+      file_has_header <-
+        has_header(
+          filepath = filepath,
+          encoding = encoding,
+          delimiter = delimiter
+        )
+
       if (length(headers) == 3) {
         col_types <-
           readr::cols(
@@ -87,7 +94,7 @@ read_portfolio_csv <- function(filepaths, combine = TRUE) {
       portfolio_df <-
         readr::read_delim(
           file = filepath,
-          skip = 1L,
+          skip = dplyr::if_else(file_has_header, 1L, 0L),
           col_names = names(headers),
           col_types = col_types,
           locale = locale,

diff --git a/tests/testthat/test-determine_header_types.R b/tests/testthat/test-determine_header_types.R
@@ -0,0 +1,15 @@
+test_that("properly identifies header types of standard portfolio CSV", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('investor_name,portfolio_name,isin,currency,market_value\nInvestor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file)
+  expect_identical(determine_header_types(csv_file), rep("character", 5))
+
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('isin,currency,market_value\nXS1088274672,1000.34,USD', csv_file)
+  expect_identical(determine_header_types(csv_file), rep("character", 3))
+})
+
+test_that("properly identifies header types with a numeric header", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('isin,123', csv_file)
+  expect_identical(determine_header_types(csv_file), c("character", "double"))
+})
diff --git a/tests/testthat/test-has_header.R b/tests/testthat/test-has_header.R
@@ -0,0 +1,17 @@
+test_that("identifies a portfolio CSV with only a header but no data", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('investor_name,portfolio_name,isin,currency,market_value', csv_file)
+  expect_true(has_header(csv_file))
+})
+
+test_that("identifies a portfolio CSV with only data and no header", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('Investor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file)
+  expect_false(has_header(csv_file))
+})
+
+test_that("identifies a portfolio CSV with a header and data", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('investor_name,portfolio_name,isin,currency,market_value\nInvestor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file)
+  expect_true(has_header(csv_file))
+})
diff --git a/tests/testthat/test-read_first_line.R b/tests/testthat/test-read_first_line.R
@@ -0,0 +1,45 @@
+test_that("properly reads first line of standard portfolio CSV", {
+  portfolio <-
+    data.frame(
+      isin = "",
+      currency = "USD",
+      market_value = 1000.34
+    )
+
+  expected_result <-
+    tibble::tibble(
+      "X1" = "isin",
+      "X2" = "currency",
+      "X3" = "market_value"
+    )
+
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  readr::write_csv(portfolio, file = csv_file)
+  expect_identical(read_first_line(csv_file), expected_result)
+})
+
+test_that("properly reads first line of CSV with only a header", {
+  expected_result <-
+    tibble::tibble(
+      "X1" = "isin",
+      "X2" = "currency",
+      "X3" = "market_value"
+    )
+
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('isin,currency,market_value', csv_file)
+  expect_identical(read_first_line(csv_file), expected_result)
+})
+
+test_that("properly reads first line of CSV with no header", {
+  expected_result <-
+    tibble::tibble(
+      "X1" = "XS1088274672",
+      "X2" = 1000.34,
+      "X3" = "USD"
+    )
+
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines("XS1088274672,1000.34,USD", csv_file)
+  expect_identical(read_first_line(csv_file), expected_result)
+})
diff --git a/tests/testthat/test-read_portfolio_csv.R b/tests/testthat/test-read_portfolio_csv.R
@@ -186,3 +186,17 @@ test_that("reads a portfolio CSV with numeric names as characters", {
   expect_type(result$investor_name, "character")
   expect_type(result$portfolio_name, "character")
 })
+
+test_that("deals with a portfolio CSV with only a header appropriately", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  readr::write_csv(portfolio_min[0, ], file = csv_file)
+  result <- read_portfolio_csv(csv_file)
+  expect_equal(result, NA)
+})
+
+test_that("deals with a portfolio CSV with no header and only one row of data", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines(paste(portfolio_min, collapse = ","), csv_file)
+  result <- read_portfolio_csv(csv_file)
+  expect_equal(unlist(result), unlist(portfolio_min))
+})