Skip to content

Commit

Permalink
deal with files with only a header line and no data without error (#75)
Browse files Browse the repository at this point in the history
closes #73
supersedes #74 

This PR contains a lot to get to the root of the problem and not
duplicate code bits that are used in multiple places.

1. Three new, unexported, utility functions are added (with tests)
`has_header()`, `determine_header_types()`, and `read_first_line()`, and
they are utilized in other functions.
2. `guess_numerical_mark()` is made more robust so that it will not
error on a file with a single line (either header only or data only),
and tests are added to keep it that way. One class of problem that I
don't have a solution for is a CSV file with one line of data (no
header) that has a numerical mark (used in an unquoted numeric) that
collides with the delimiter, e.g. `XS1088274672,1,000.34,USD`. There is
ambiguity in there that I can't resolve.
3. `read_portfolio_csv()` is made more robust so that it will not error
on a file with only a header and no data (returns as expected NA) and
tests for that are added.
4. Also introduced the idea of optionally passing `encoding` and
`delimiter` to these utility functions so that they do not have to be
determined so many times during the process of reading a portfolio CSV.
Eventually, this concept should spread to other functions.
  • Loading branch information
cjyetman authored Jan 31, 2024
1 parent 6d7e661 commit 5a449a1
Show file tree
Hide file tree
Showing 10 changed files with 229 additions and 20 deletions.
37 changes: 37 additions & 0 deletions R/determine_header_types.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
determine_header_types <- function(filepath, encoding = NULL, delimiter = NULL) {
if (!is_text_file(filepath)) {
return(NA)
}

if (is.null(encoding)) {
encoding <- guess_file_encoding(filepath)
}

if (is.null(delimiter)) {
delimiter <- guess_delimiter(filepath)
}

if (any(is.na(c(encoding, delimiter)))) {
return(NA)
}

first_line <-
read_first_line(
filepath = filepath,
encoding = encoding,
delimiter = delimiter
)

vapply(
X = readr::spec(first_line)$cols,
FUN = function(x) {
sub(
pattern = "^collector_",
replacement = "",
x = class(x)[[1]]
)
},
FUN.VALUE = character(1),
USE.NAMES = FALSE
)
}
33 changes: 20 additions & 13 deletions R/determine_headers.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,29 @@ determine_headers <- function(filepath) {
return(NA_character_)
}

locale <-
readr::locale(
encoding = encoding
file_has_header <-
has_header(
filepath = filepath,
encoding = encoding,
delimiter = delimiter
)

first_line <- readr::read_delim(filepath, delim = delimiter, n_max = 1L, locale = locale, col_names = FALSE, show_col_types = FALSE, progress = FALSE)
header_types <- vapply(X = readr::spec(first_line)$cols, FUN = function(x) sub("^collector_", "", class(x)[[1]]), FUN.VALUE = character(1), USE.NAMES = FALSE)
first_line <-
read_first_line(
filepath = filepath,
encoding = encoding,
delimiter = delimiter
)

if (all(header_types %in% c("character", "logical"))) {
has_header <- TRUE
} else {
has_header <- FALSE
}
header_types <-
determine_header_types(
filepath = filepath,
encoding = encoding,
delimiter = delimiter
)

if (!has_header) {
num_of_cols <- length(header_types)
if (!file_has_header) {
num_of_cols <- ncol(first_line)
if (num_of_cols == 3) {
col_names <- c("isin", "market_value", "currency")
names(col_names) <- col_names
Expand Down Expand Up @@ -92,7 +99,7 @@ determine_headers <- function(filepath) {
if (not_portfolio_csv) {
return(NA_character_)
}
if (!has_header) {
if (!file_has_header) {
return(col_names)
}

Expand Down
20 changes: 14 additions & 6 deletions R/guess_numerical_mark.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,25 @@ guess_numerical_mark <- function(filepaths, type = "decimal") {
encoding <- guess_file_encoding(filepath)
delimiter <- guess_delimiter(filepath)

cust_locale <-
readr::locale(
encoding = encoding
# return NA if file only has a header row and no data
file_has_header <-
has_header(
filepath = filepath,
encoding = encoding,
delimiter = delimiter
)

only_one_line <- length(readLines(con = filepath, n = 2L)) < 2

if (file_has_header && only_one_line) {
return(NA_character_)
}

# determine appropriate column
headers <- determine_headers(filepath)

if ("market_value" %in% names(headers)) {
num_col_idx <- match("market_value", names(headers))
has_header_row <- TRUE
} else {
warning("`market_value` column could not be determined")
return(NA_character_)
Expand All @@ -52,12 +60,12 @@ guess_numerical_mark <- function(filepaths, type = "decimal") {
readr::read_delim(
file = filepath,
delim = delimiter,
locale = cust_locale,
locale = readr::locale(encoding = encoding),
trim_ws = TRUE,
col_types = readr::cols(.default = "c"),
col_names = FALSE,
col_select = dplyr::all_of(num_col_idx),
skip = dplyr::if_else(has_header_row, 1L, 0L),
skip = dplyr::if_else(file_has_header, 1L, 0L),
progress = FALSE,
show_col_types = FALSE
)
Expand Down
32 changes: 32 additions & 0 deletions R/has_header.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
has_header <- function(filepath, encoding = NULL, delimiter = NULL) {
if (!is_text_file(filepath)) {
return(NA)
}

if (is.null(encoding)) {
encoding <- guess_file_encoding(filepath)
}

if (is.null(delimiter)) {
delimiter <- guess_delimiter(filepath)
}

if (any(is.na(c(encoding, delimiter)))) {
return(NA)
}

header_types <-
determine_header_types(
filepath = filepath,
encoding = encoding,
delimiter = delimiter
)

if (all(header_types %in% c("character", "logical"))) {
has_header <- TRUE
} else {
has_header <- FALSE
}

has_header
}
27 changes: 27 additions & 0 deletions R/read_first_line.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
read_first_line <- function(filepath, encoding = NULL, delimiter = NULL) {
if (!is_text_file(filepath)) {
return(NA)
}

if (is.null(encoding)) {
encoding <- guess_file_encoding(filepath)
}

if (is.null(delimiter)) {
delimiter <- guess_delimiter(filepath)
}

if (any(is.na(c(encoding, delimiter)))) {
return(NA)
}

readr::read_delim(
file = filepath,
delim = delimiter,
n_max = 1L,
locale = readr::locale(encoding = encoding),
col_names = FALSE,
show_col_types = FALSE,
progress = FALSE
)
}
9 changes: 8 additions & 1 deletion R/read_portfolio_csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ read_portfolio_csv <- function(filepaths, combine = TRUE) {

headers <- determine_headers(filepath)

file_has_header <-
has_header(
filepath = filepath,
encoding = encoding,
delimiter = delimiter
)

if (length(headers) == 3) {
col_types <-
readr::cols(
Expand All @@ -87,7 +94,7 @@ read_portfolio_csv <- function(filepaths, combine = TRUE) {
portfolio_df <-
readr::read_delim(
file = filepath,
skip = 1L,
skip = dplyr::if_else(file_has_header, 1L, 0L),
col_names = names(headers),
col_types = col_types,
locale = locale,
Expand Down
15 changes: 15 additions & 0 deletions tests/testthat/test-determine_header_types.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
test_that("properly identifies header types of standard portfolio CSV", {
csv_file <- withr::local_tempfile(fileext = ".csv")
writeLines('investor_name,portfolio_name,isin,currency,market_value\nInvestor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file)
expect_identical(determine_header_types(csv_file), rep("character", 5))

csv_file <- withr::local_tempfile(fileext = ".csv")
writeLines('isin,currency,market_value\nXS1088274672,1000.34,USD', csv_file)
expect_identical(determine_header_types(csv_file), rep("character", 3))
})

test_that("properly identifies header types with a numeric header", {
csv_file <- withr::local_tempfile(fileext = ".csv")
writeLines('isin,123', csv_file)
expect_identical(determine_header_types(csv_file), c("character", "double"))
})
17 changes: 17 additions & 0 deletions tests/testthat/test-has_header.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
test_that("identifies a portfolio CSV with only a header but no data", {
csv_file <- withr::local_tempfile(fileext = ".csv")
writeLines('investor_name,portfolio_name,isin,currency,market_value', csv_file)
expect_true(has_header(csv_file))
})

test_that("identifies a portfolio CSV with only data and no header", {
csv_file <- withr::local_tempfile(fileext = ".csv")
writeLines('Investor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file)
expect_false(has_header(csv_file))
})

test_that("identifies a portfolio CSV with a header and data", {
csv_file <- withr::local_tempfile(fileext = ".csv")
writeLines('investor_name,portfolio_name,isin,currency,market_value\nInvestor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file)
expect_true(has_header(csv_file))
})
45 changes: 45 additions & 0 deletions tests/testthat/test-read_first_line.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
test_that("properly reads first line of standard portfolio CSV", {
portfolio <-
data.frame(
isin = "",
currency = "USD",
market_value = 1000.34
)

expected_result <-
tibble::tibble(
"X1" = "isin",
"X2" = "currency",
"X3" = "market_value"
)

csv_file <- withr::local_tempfile(fileext = ".csv")
readr::write_csv(portfolio, file = csv_file)
expect_identical(read_first_line(csv_file), expected_result)
})

test_that("properly reads first line of CSV with only a header", {
expected_result <-
tibble::tibble(
"X1" = "isin",
"X2" = "currency",
"X3" = "market_value"
)

csv_file <- withr::local_tempfile(fileext = ".csv")
writeLines('isin,currency,market_value', csv_file)
expect_identical(read_first_line(csv_file), expected_result)
})

test_that("properly reads first line of CSV with no header", {
expected_result <-
tibble::tibble(
"X1" = "XS1088274672",
"X2" = 1000.34,
"X3" = "USD"
)

csv_file <- withr::local_tempfile(fileext = ".csv")
writeLines("XS1088274672,1000.34,USD", csv_file)
expect_identical(read_first_line(csv_file), expected_result)
})
14 changes: 14 additions & 0 deletions tests/testthat/test-read_portfolio_csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,17 @@ test_that("reads a portfolio CSV with numeric names as characters", {
expect_type(result$investor_name, "character")
expect_type(result$portfolio_name, "character")
})

test_that("deals with a portfolio CSV with only a header appropriately", {
csv_file <- withr::local_tempfile(fileext = ".csv")
readr::write_csv(portfolio_min[0, ], file = csv_file)
result <- read_portfolio_csv(csv_file)
expect_equal(result, NA)
})

test_that("deals with a portfolio CSV with no header and only one row of data", {
csv_file <- withr::local_tempfile(fileext = ".csv")
writeLines(paste(portfolio_min, collapse = ","), csv_file)
result <- read_portfolio_csv(csv_file)
expect_equal(unlist(result), unlist(portfolio_min))
})

0 comments on commit 5a449a1

Please sign in to comment.