From 5a449a1f5485bca10c96fe38aa0b54c949e391bc Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Wed, 31 Jan 2024 12:31:59 +0100 Subject: [PATCH] deal with files with only a header line and no data without error (#75) closes #73 supersedes #74 This PR contains a lot to get to the root of the problem and not duplicate code bits that are used in multiple places. 1. Three new, unexported, utility functions are added (with tests) `has_header()`, `determine_header_types()`, and `read_first_line()`, and they are utilized in other functions. 2. `guess_numerical_mark()` is made more robust so that it will not error on a file with a single line (either header only or data only), and tests are added to keep it that way. One class of problem that I don't have a solution for is a CSV file with one line of data (no header) that has a numerical mark (used in an unquoted numeric) that collides with the delimiter, e.g. `XS1088274672,1,000.34,USD`. There is ambiguity in there that I can't resolve. 3. `read_portfolio_csv()` is made more robust so that it will not error on a file with only a header and no data (returns as expected NA) and tests for that are added. 4. Also introduced the idea of optionally passing `encoding` and `delimiter` to these utility functions so that they do not have to be determined so many times during the process of reading a portfolio CSV. Eventually, this concept should spread to other functions. --- R/determine_header_types.R | 37 ++++++++++++++++ R/determine_headers.R | 33 ++++++++------ R/guess_numerical_mark.R | 20 ++++++--- R/has_header.R | 32 ++++++++++++++ R/read_first_line.R | 27 ++++++++++++ R/read_portfolio_csv.R | 9 +++- tests/testthat/test-determine_header_types.R | 15 +++++++ tests/testthat/test-has_header.R | 17 ++++++++ tests/testthat/test-read_first_line.R | 45 ++++++++++++++++++++ tests/testthat/test-read_portfolio_csv.R | 14 ++++++ 10 files changed, 229 insertions(+), 20 deletions(-) create mode 100644 R/determine_header_types.R create mode 100644 R/has_header.R create mode 100644 R/read_first_line.R create mode 100644 tests/testthat/test-determine_header_types.R create mode 100644 tests/testthat/test-has_header.R create mode 100644 tests/testthat/test-read_first_line.R diff --git a/R/determine_header_types.R b/R/determine_header_types.R new file mode 100644 index 0000000..7bd66da --- /dev/null +++ b/R/determine_header_types.R @@ -0,0 +1,37 @@ +determine_header_types <- function(filepath, encoding = NULL, delimiter = NULL) { + if (!is_text_file(filepath)) { + return(NA) + } + + if (is.null(encoding)) { + encoding <- guess_file_encoding(filepath) + } + + if (is.null(delimiter)) { + delimiter <- guess_delimiter(filepath) + } + + if (any(is.na(c(encoding, delimiter)))) { + return(NA) + } + + first_line <- + read_first_line( + filepath = filepath, + encoding = encoding, + delimiter = delimiter + ) + + vapply( + X = readr::spec(first_line)$cols, + FUN = function(x) { + sub( + pattern = "^collector_", + replacement = "", + x = class(x)[[1]] + ) + }, + FUN.VALUE = character(1), + USE.NAMES = FALSE + ) +} diff --git a/R/determine_headers.R b/R/determine_headers.R index c96134c..5f12bb1 100644 --- a/R/determine_headers.R +++ b/R/determine_headers.R @@ -27,22 +27,29 @@ determine_headers <- function(filepath) { return(NA_character_) } - locale <- - readr::locale( - encoding = encoding + file_has_header <- + has_header( + filepath = filepath, + encoding = encoding, + delimiter = delimiter ) - first_line <- readr::read_delim(filepath, delim = delimiter, n_max = 1L, locale = locale, col_names = FALSE, show_col_types = FALSE, progress = FALSE) - header_types <- vapply(X = readr::spec(first_line)$cols, FUN = function(x) sub("^collector_", "", class(x)[[1]]), FUN.VALUE = character(1), USE.NAMES = FALSE) + first_line <- + read_first_line( + filepath = filepath, + encoding = encoding, + delimiter = delimiter + ) - if (all(header_types %in% c("character", "logical"))) { - has_header <- TRUE - } else { - has_header <- FALSE - } + header_types <- + determine_header_types( + filepath = filepath, + encoding = encoding, + delimiter = delimiter + ) - if (!has_header) { - num_of_cols <- length(header_types) + if (!file_has_header) { + num_of_cols <- ncol(first_line) if (num_of_cols == 3) { col_names <- c("isin", "market_value", "currency") names(col_names) <- col_names @@ -92,7 +99,7 @@ determine_headers <- function(filepath) { if (not_portfolio_csv) { return(NA_character_) } - if (!has_header) { + if (!file_has_header) { return(col_names) } diff --git a/R/guess_numerical_mark.R b/R/guess_numerical_mark.R index 53bd6e2..4f2c063 100644 --- a/R/guess_numerical_mark.R +++ b/R/guess_numerical_mark.R @@ -31,17 +31,25 @@ guess_numerical_mark <- function(filepaths, type = "decimal") { encoding <- guess_file_encoding(filepath) delimiter <- guess_delimiter(filepath) - cust_locale <- - readr::locale( - encoding = encoding + # return NA if file only has a header row and no data + file_has_header <- + has_header( + filepath = filepath, + encoding = encoding, + delimiter = delimiter ) + only_one_line <- length(readLines(con = filepath, n = 2L)) < 2 + + if (file_has_header && only_one_line) { + return(NA_character_) + } + # determine appropriate column headers <- determine_headers(filepath) if ("market_value" %in% names(headers)) { num_col_idx <- match("market_value", names(headers)) - has_header_row <- TRUE } else { warning("`market_value` column could not be determined") return(NA_character_) @@ -52,12 +60,12 @@ guess_numerical_mark <- function(filepaths, type = "decimal") { readr::read_delim( file = filepath, delim = delimiter, - locale = cust_locale, + locale = readr::locale(encoding = encoding), trim_ws = TRUE, col_types = readr::cols(.default = "c"), col_names = FALSE, col_select = dplyr::all_of(num_col_idx), - skip = dplyr::if_else(has_header_row, 1L, 0L), + skip = dplyr::if_else(file_has_header, 1L, 0L), progress = FALSE, show_col_types = FALSE ) diff --git a/R/has_header.R b/R/has_header.R new file mode 100644 index 0000000..9590fcf --- /dev/null +++ b/R/has_header.R @@ -0,0 +1,32 @@ +has_header <- function(filepath, encoding = NULL, delimiter = NULL) { + if (!is_text_file(filepath)) { + return(NA) + } + + if (is.null(encoding)) { + encoding <- guess_file_encoding(filepath) + } + + if (is.null(delimiter)) { + delimiter <- guess_delimiter(filepath) + } + + if (any(is.na(c(encoding, delimiter)))) { + return(NA) + } + + header_types <- + determine_header_types( + filepath = filepath, + encoding = encoding, + delimiter = delimiter + ) + + if (all(header_types %in% c("character", "logical"))) { + has_header <- TRUE + } else { + has_header <- FALSE + } + + has_header +} diff --git a/R/read_first_line.R b/R/read_first_line.R new file mode 100644 index 0000000..cd188a4 --- /dev/null +++ b/R/read_first_line.R @@ -0,0 +1,27 @@ +read_first_line <- function(filepath, encoding = NULL, delimiter = NULL) { + if (!is_text_file(filepath)) { + return(NA) + } + + if (is.null(encoding)) { + encoding <- guess_file_encoding(filepath) + } + + if (is.null(delimiter)) { + delimiter <- guess_delimiter(filepath) + } + + if (any(is.na(c(encoding, delimiter)))) { + return(NA) + } + + readr::read_delim( + file = filepath, + delim = delimiter, + n_max = 1L, + locale = readr::locale(encoding = encoding), + col_names = FALSE, + show_col_types = FALSE, + progress = FALSE + ) +} diff --git a/R/read_portfolio_csv.R b/R/read_portfolio_csv.R index 8a38400..d7716e9 100644 --- a/R/read_portfolio_csv.R +++ b/R/read_portfolio_csv.R @@ -64,6 +64,13 @@ read_portfolio_csv <- function(filepaths, combine = TRUE) { headers <- determine_headers(filepath) + file_has_header <- + has_header( + filepath = filepath, + encoding = encoding, + delimiter = delimiter + ) + if (length(headers) == 3) { col_types <- readr::cols( @@ -87,7 +94,7 @@ read_portfolio_csv <- function(filepaths, combine = TRUE) { portfolio_df <- readr::read_delim( file = filepath, - skip = 1L, + skip = dplyr::if_else(file_has_header, 1L, 0L), col_names = names(headers), col_types = col_types, locale = locale, diff --git a/tests/testthat/test-determine_header_types.R b/tests/testthat/test-determine_header_types.R new file mode 100644 index 0000000..157c6f7 --- /dev/null +++ b/tests/testthat/test-determine_header_types.R @@ -0,0 +1,15 @@ +test_that("properly identifies header types of standard portfolio CSV", { + csv_file <- withr::local_tempfile(fileext = ".csv") + writeLines('investor_name,portfolio_name,isin,currency,market_value\nInvestor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file) + expect_identical(determine_header_types(csv_file), rep("character", 5)) + + csv_file <- withr::local_tempfile(fileext = ".csv") + writeLines('isin,currency,market_value\nXS1088274672,1000.34,USD', csv_file) + expect_identical(determine_header_types(csv_file), rep("character", 3)) +}) + +test_that("properly identifies header types with a numeric header", { + csv_file <- withr::local_tempfile(fileext = ".csv") + writeLines('isin,123', csv_file) + expect_identical(determine_header_types(csv_file), c("character", "double")) +}) diff --git a/tests/testthat/test-has_header.R b/tests/testthat/test-has_header.R new file mode 100644 index 0000000..c5e3546 --- /dev/null +++ b/tests/testthat/test-has_header.R @@ -0,0 +1,17 @@ +test_that("identifies a portfolio CSV with only a header but no data", { + csv_file <- withr::local_tempfile(fileext = ".csv") + writeLines('investor_name,portfolio_name,isin,currency,market_value', csv_file) + expect_true(has_header(csv_file)) +}) + +test_that("identifies a portfolio CSV with only data and no header", { + csv_file <- withr::local_tempfile(fileext = ".csv") + writeLines('Investor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file) + expect_false(has_header(csv_file)) +}) + +test_that("identifies a portfolio CSV with a header and data", { + csv_file <- withr::local_tempfile(fileext = ".csv") + writeLines('investor_name,portfolio_name,isin,currency,market_value\nInvestor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file) + expect_true(has_header(csv_file)) +}) diff --git a/tests/testthat/test-read_first_line.R b/tests/testthat/test-read_first_line.R new file mode 100644 index 0000000..d10415d --- /dev/null +++ b/tests/testthat/test-read_first_line.R @@ -0,0 +1,45 @@ +test_that("properly reads first line of standard portfolio CSV", { + portfolio <- + data.frame( + isin = "", + currency = "USD", + market_value = 1000.34 + ) + + expected_result <- + tibble::tibble( + "X1" = "isin", + "X2" = "currency", + "X3" = "market_value" + ) + + csv_file <- withr::local_tempfile(fileext = ".csv") + readr::write_csv(portfolio, file = csv_file) + expect_identical(read_first_line(csv_file), expected_result) +}) + +test_that("properly reads first line of CSV with only a header", { + expected_result <- + tibble::tibble( + "X1" = "isin", + "X2" = "currency", + "X3" = "market_value" + ) + + csv_file <- withr::local_tempfile(fileext = ".csv") + writeLines('isin,currency,market_value', csv_file) + expect_identical(read_first_line(csv_file), expected_result) +}) + +test_that("properly reads first line of CSV with no header", { + expected_result <- + tibble::tibble( + "X1" = "XS1088274672", + "X2" = 1000.34, + "X3" = "USD" + ) + + csv_file <- withr::local_tempfile(fileext = ".csv") + writeLines("XS1088274672,1000.34,USD", csv_file) + expect_identical(read_first_line(csv_file), expected_result) +}) diff --git a/tests/testthat/test-read_portfolio_csv.R b/tests/testthat/test-read_portfolio_csv.R index ba89045..efeddb3 100644 --- a/tests/testthat/test-read_portfolio_csv.R +++ b/tests/testthat/test-read_portfolio_csv.R @@ -186,3 +186,17 @@ test_that("reads a portfolio CSV with numeric names as characters", { expect_type(result$investor_name, "character") expect_type(result$portfolio_name, "character") }) + +test_that("deals with a portfolio CSV with only a header appropriately", { + csv_file <- withr::local_tempfile(fileext = ".csv") + readr::write_csv(portfolio_min[0, ], file = csv_file) + result <- read_portfolio_csv(csv_file) + expect_equal(result, NA) +}) + +test_that("deals with a portfolio CSV with no header and only one row of data", { + csv_file <- withr::local_tempfile(fileext = ".csv") + writeLines(paste(portfolio_min, collapse = ","), csv_file) + result <- read_portfolio_csv(csv_file) + expect_equal(unlist(result), unlist(portfolio_min)) +})