-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
deal with files with only a header line and no data without error (#75)
Closes #73. Supersedes #74. This PR contains a lot of changes in order to get to the root of the problem and to avoid duplicating code that is used in multiple places. 1. Three new, unexported utility functions are added (with tests): `has_header()`, `determine_header_types()`, and `read_first_line()`, and they are utilized in other functions. 2. `guess_numerical_mark()` is made more robust so that it will not error on a file with a single line (either header only or data only), and tests are added to keep it that way. One class of problem that I don't have a solution for is a CSV file with one line of data (no header) that has a numerical mark (used in an unquoted numeric) that collides with the delimiter, e.g. `XS1088274672,1,000.34,USD`. There is ambiguity in there that I can't resolve. 3. `read_portfolio_csv()` is made more robust so that it will not error on a file with only a header and no data (it returns NA, as expected), and tests for that are added. 4. Also introduced the idea of optionally passing `encoding` and `delimiter` to these utility functions so that they do not have to be determined so many times during the process of reading a portfolio CSV. Eventually, this concept should spread to other functions.
- Loading branch information
Showing
10 changed files
with
229 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#' Determine the column types guessed for the first line of a delimited file
#'
#' Reads the first line with `read_first_line()` and reports, for each column,
#' the name of the readr collector class in its column specification with the
#' leading "collector_" removed (e.g. "character", "double").
#'
#' @param filepath Path of the file to inspect.
#' @param encoding File encoding; guessed with `guess_file_encoding()` when
#'   `NULL`.
#' @param delimiter Field delimiter; guessed with `guess_delimiter()` when
#'   `NULL`.
#'
#' @return An unnamed character vector with one element per column, or `NA`
#'   when the file is not a text file or the encoding/delimiter is `NA`.
#'
#' @noRd
determine_header_types <- function(filepath, encoding = NULL, delimiter = NULL) {
  if (!is_text_file(filepath)) {
    return(NA)
  }

  # Only guess what the caller did not supply (encoding first, then delimiter).
  encoding <- if (is.null(encoding)) guess_file_encoding(filepath) else encoding
  delimiter <- if (is.null(delimiter)) guess_delimiter(filepath) else delimiter

  if (anyNA(c(encoding, delimiter))) {
    return(NA)
  }

  col_specs <- readr::spec(
    read_first_line(
      filepath = filepath,
      encoding = encoding,
      delimiter = delimiter
    )
  )$cols

  # The first class of each collector looks like "collector_<type>".
  collector_type <- function(collector) {
    sub(pattern = "^collector_", replacement = "", x = class(collector)[[1]])
  }

  vapply(
    X = col_specs,
    FUN = collector_type,
    FUN.VALUE = character(1),
    USE.NAMES = FALSE
  )
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#' Guess whether the first line of a delimited text file is a header
#'
#' The column types of the first line are obtained from
#' `determine_header_types()`. The line is considered a header only when there
#' is at least one column and every column type is "character" or "logical".
#'
#' @param filepath Path of the file to inspect.
#' @param encoding File encoding; guessed with `guess_file_encoding()` when
#'   `NULL`.
#' @param delimiter Field delimiter; guessed with `guess_delimiter()` when
#'   `NULL`.
#'
#' @return `TRUE` or `FALSE`, or `NA` when the file is not a text file, the
#'   encoding/delimiter is `NA`, or the header types could not be determined.
#'
#' @noRd
has_header <- function(filepath, encoding = NULL, delimiter = NULL) {
  if (!is_text_file(filepath)) {
    return(NA)
  }

  if (is.null(encoding)) {
    encoding <- guess_file_encoding(filepath)
  }

  if (is.null(delimiter)) {
    delimiter <- guess_delimiter(filepath)
  }

  if (any(is.na(c(encoding, delimiter)))) {
    return(NA)
  }

  header_types <-
    determine_header_types(
      filepath = filepath,
      encoding = encoding,
      delimiter = delimiter
    )

  # determine_header_types() returns a (non-character) NA when it cannot
  # decide; propagate that instead of letting `%in%` turn it into FALSE.
  if (!is.character(header_types)) {
    return(NA)
  }

  # all() is vacuously TRUE for zero columns, so require at least one column
  # before declaring that a header is present.
  length(header_types) > 0L &&
    all(header_types %in% c("character", "logical"))
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#' Read only the first line of a delimited text file
#'
#' The line is read without treating it as column names, so a header line
#' comes back as a data row.
#'
#' @param filepath Path of the file to read.
#' @param encoding File encoding; guessed with `guess_file_encoding()` when
#'   `NULL`.
#' @param delimiter Field delimiter; guessed with `guess_delimiter()` when
#'   `NULL`.
#'
#' @return The result of `readr::read_delim()` limited to one row, or `NA`
#'   when the file is not a text file or the encoding/delimiter is `NA`.
#'
#' @noRd
read_first_line <- function(filepath, encoding = NULL, delimiter = NULL) {
  if (!is_text_file(filepath)) {
    return(NA)
  }

  # Only guess what the caller did not supply (encoding first, then delimiter).
  encoding <- if (is.null(encoding)) guess_file_encoding(filepath) else encoding
  delimiter <- if (is.null(delimiter)) guess_delimiter(filepath) else delimiter

  if (anyNA(c(encoding, delimiter))) {
    return(NA)
  }

  file_locale <- readr::locale(encoding = encoding)

  readr::read_delim(
    file = filepath,
    delim = delimiter,
    locale = file_locale,
    col_names = FALSE,
    n_max = 1L,
    show_col_types = FALSE,
    progress = FALSE
  )
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Only the first line of each file is inspected by determine_header_types(),
# but the data rows are kept in the same column order as the header
# (currency = "USD", market_value = 1000.34) so the fixtures are valid
# portfolio files.
test_that("properly identifies header types of standard portfolio CSV", {
  csv_file <- withr::local_tempfile(fileext = ".csv")
  writeLines(
    c(
      "investor_name,portfolio_name,isin,currency,market_value",
      "Investor Name,Portfolio Name,XS1088274672,USD,1000.34"
    ),
    csv_file
  )
  expect_identical(determine_header_types(csv_file), rep("character", 5))

  csv_file <- withr::local_tempfile(fileext = ".csv")
  writeLines(
    c(
      "isin,currency,market_value",
      "XS1088274672,USD,1000.34"
    ),
    csv_file
  )
  expect_identical(determine_header_types(csv_file), rep("character", 3))
})

test_that("properly identifies header types with a numeric header", {
  # A numeric-looking first-line cell must be reported as "double".
  csv_file <- withr::local_tempfile(fileext = ".csv")
  writeLines("isin,123", csv_file)
  expect_identical(determine_header_types(csv_file), c("character", "double"))
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Data rows follow the header's column order (currency = "USD",
# market_value = 1000.34) so the fixtures are valid portfolio files.
test_that("identifies a portfolio CSV with only a header but no data", {
  csv_file <- withr::local_tempfile(fileext = ".csv")
  writeLines(
    "investor_name,portfolio_name,isin,currency,market_value",
    csv_file
  )
  expect_true(has_header(csv_file))
})

test_that("identifies a portfolio CSV with only data and no header", {
  # The numeric market value in the first line is what rules out a header.
  csv_file <- withr::local_tempfile(fileext = ".csv")
  writeLines(
    "Investor Name,Portfolio Name,XS1088274672,USD,1000.34",
    csv_file
  )
  expect_false(has_header(csv_file))
})

test_that("identifies a portfolio CSV with a header and data", {
  csv_file <- withr::local_tempfile(fileext = ".csv")
  writeLines(
    c(
      "investor_name,portfolio_name,isin,currency,market_value",
      "Investor Name,Portfolio Name,XS1088274672,USD,1000.34"
    ),
    csv_file
  )
  expect_true(has_header(csv_file))
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
test_that("properly reads first line of standard portfolio CSV", {
  # write_csv() emits the column names as the first line of the file.
  holdings <- data.frame(isin = "", currency = "USD", market_value = 1000.34)

  path <- withr::local_tempfile(fileext = ".csv")
  readr::write_csv(holdings, file = path)

  expect_identical(
    read_first_line(path),
    tibble::tibble(X1 = "isin", X2 = "currency", X3 = "market_value")
  )
})

test_that("properly reads first line of CSV with only a header", {
  path <- withr::local_tempfile(fileext = ".csv")
  writeLines("isin,currency,market_value", path)

  expect_identical(
    read_first_line(path),
    tibble::tibble(X1 = "isin", X2 = "currency", X3 = "market_value")
  )
})

test_that("properly reads first line of CSV with no header", {
  # Without a header the first line is data, so the numeric cell is expected
  # back as a double rather than as text.
  path <- withr::local_tempfile(fileext = ".csv")
  writeLines("XS1088274672,1000.34,USD", path)

  expect_identical(
    read_first_line(path),
    tibble::tibble(X1 = "XS1088274672", X2 = 1000.34, X3 = "USD")
  )
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters