From 5a449a1f5485bca10c96fe38aa0b54c949e391bc Mon Sep 17 00:00:00 2001
From: CJ Yetman <cyetman@rmi.org>
Date: Wed, 31 Jan 2024 12:31:59 +0100
Subject: [PATCH] deal with files with only a header line and no data without
 error (#75)

closes #73
supersedes #74

This PR contains a lot to get to the root of the problem and not
duplicate code bits that are used in multiple places.

1. Three new, unexported, utility functions are added (with tests)
`has_header()`, `determine_header_types()`, and `read_first_line()`, and
they are utilized in other functions.
2. `guess_numerical_mark()` is made more robust so that it will not
error on a file with a single line (either header only or data only),
and tests are added to keep it that way. One class of problem that I
don't have a solution for is a CSV file with one line of data (no
header) that has a numerical mark (used in an unquoted numeric) that
collides with the delimiter, e.g. `XS1088274672,1,000.34,USD`. There is
ambiguity in there that I can't resolve.
3. `read_portfolio_csv()` is made more robust so that it will not error
on a file with only a header and no data (returns as expected NA) and
tests for that are added.
4. Also introduced the idea of optionally passing `encoding` and
`delimiter` to these utility functions so that they do not have to be
determined so many times during the process of reading a portfolio CSV.
Eventually, this concept should spread to other functions.
---
 R/determine_header_types.R                   | 37 ++++++++++++++++
 R/determine_headers.R                        | 33 ++++++++------
 R/guess_numerical_mark.R                     | 20 ++++++---
 R/has_header.R                               | 32 ++++++++++++++
 R/read_first_line.R                          | 27 ++++++++++++
 R/read_portfolio_csv.R                       |  9 +++-
 tests/testthat/test-determine_header_types.R | 15 +++++++
 tests/testthat/test-has_header.R             | 17 ++++++++
 tests/testthat/test-read_first_line.R        | 45 ++++++++++++++++++++
 tests/testthat/test-read_portfolio_csv.R     | 14 ++++++
 10 files changed, 229 insertions(+), 20 deletions(-)
 create mode 100644 R/determine_header_types.R
 create mode 100644 R/has_header.R
 create mode 100644 R/read_first_line.R
 create mode 100644 tests/testthat/test-determine_header_types.R
 create mode 100644 tests/testthat/test-has_header.R
 create mode 100644 tests/testthat/test-read_first_line.R

diff --git a/R/determine_header_types.R b/R/determine_header_types.R
new file mode 100644
index 0000000..7bd66da
--- /dev/null
+++ b/R/determine_header_types.R
@@ -0,0 +1,37 @@
+determine_header_types <- function(filepath, encoding = NULL, delimiter = NULL) {
+  if (!is_text_file(filepath)) {
+    return(NA)
+  }
+
+  if (is.null(encoding)) {
+    encoding <- guess_file_encoding(filepath)
+  }
+
+  if (is.null(delimiter)) {
+    delimiter <- guess_delimiter(filepath)
+  }
+
+  if (any(is.na(c(encoding, delimiter)))) {
+    return(NA)
+  }
+
+  first_line <-
+    read_first_line(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
+    )
+
+  vapply(
+    X = readr::spec(first_line)$cols,
+    FUN = function(x) {
+      sub(
+        pattern = "^collector_",
+        replacement = "",
+        x = class(x)[[1]]
+      )
+    },
+    FUN.VALUE = character(1),
+    USE.NAMES = FALSE
+  )
+}
diff --git a/R/determine_headers.R b/R/determine_headers.R
index c96134c..5f12bb1 100644
--- a/R/determine_headers.R
+++ b/R/determine_headers.R
@@ -27,22 +27,29 @@ determine_headers <- function(filepath) {
     return(NA_character_)
   }
 
-  locale <-
-    readr::locale(
-      encoding = encoding
+  file_has_header <-
+    has_header(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
     )
 
-  first_line <- readr::read_delim(filepath, delim = delimiter, n_max = 1L, locale = locale, col_names = FALSE, show_col_types = FALSE, progress = FALSE)
-  header_types <- vapply(X = readr::spec(first_line)$cols, FUN = function(x) sub("^collector_", "", class(x)[[1]]), FUN.VALUE = character(1), USE.NAMES = FALSE)
+  first_line <-
+    read_first_line(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
+    )
 
-  if (all(header_types %in% c("character", "logical"))) {
-    has_header <- TRUE
-  } else {
-    has_header <- FALSE
-  }
+  header_types <-
+    determine_header_types(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
+    )
 
-  if (!has_header) {
-    num_of_cols <- length(header_types)
+  if (!file_has_header) {
+    num_of_cols <- ncol(first_line)
     if (num_of_cols == 3) {
       col_names <- c("isin", "market_value", "currency")
       names(col_names) <- col_names
@@ -92,7 +99,7 @@ determine_headers <- function(filepath) {
   if (not_portfolio_csv) {
     return(NA_character_)
   }
-  if (!has_header) {
+  if (!file_has_header) {
     return(col_names)
   }
 
diff --git a/R/guess_numerical_mark.R b/R/guess_numerical_mark.R
index 53bd6e2..4f2c063 100644
--- a/R/guess_numerical_mark.R
+++ b/R/guess_numerical_mark.R
@@ -31,17 +31,25 @@ guess_numerical_mark <- function(filepaths, type = "decimal") {
       encoding <- guess_file_encoding(filepath)
       delimiter <- guess_delimiter(filepath)
 
-      cust_locale <-
-        readr::locale(
-          encoding = encoding
+      # return NA if file only has a header row and no data
+      file_has_header <-
+        has_header(
+          filepath = filepath,
+          encoding = encoding,
+          delimiter = delimiter
         )
 
+      only_one_line <- length(readLines(con = filepath, n = 2L)) < 2
+
+      if (file_has_header && only_one_line) {
+        return(NA_character_)
+      }
+
       # determine appropriate column
       headers <- determine_headers(filepath)
 
       if ("market_value" %in% names(headers)) {
         num_col_idx <- match("market_value", names(headers))
-        has_header_row <- TRUE
       } else {
         warning("`market_value` column could not be determined")
         return(NA_character_)
@@ -52,12 +60,12 @@ guess_numerical_mark <- function(filepaths, type = "decimal") {
         readr::read_delim(
           file = filepath,
           delim = delimiter,
-          locale = cust_locale,
+          locale = readr::locale(encoding = encoding),
           trim_ws = TRUE,
           col_types = readr::cols(.default = "c"),
           col_names = FALSE,
           col_select = dplyr::all_of(num_col_idx),
-          skip = dplyr::if_else(has_header_row, 1L, 0L),
+          skip = dplyr::if_else(file_has_header, 1L, 0L),
           progress = FALSE,
           show_col_types = FALSE
         )
diff --git a/R/has_header.R b/R/has_header.R
new file mode 100644
index 0000000..9590fcf
--- /dev/null
+++ b/R/has_header.R
@@ -0,0 +1,32 @@
+has_header <- function(filepath, encoding = NULL, delimiter = NULL) {
+  if (!is_text_file(filepath)) {
+    return(NA)
+  }
+
+  if (is.null(encoding)) {
+    encoding <- guess_file_encoding(filepath)
+  }
+
+  if (is.null(delimiter)) {
+    delimiter <- guess_delimiter(filepath)
+  }
+
+  if (any(is.na(c(encoding, delimiter)))) {
+    return(NA)
+  }
+
+  header_types <-
+    determine_header_types(
+      filepath = filepath,
+      encoding = encoding,
+      delimiter = delimiter
+    )
+
+  if (all(header_types %in% c("character", "logical"))) {
+    has_header <- TRUE
+  } else {
+    has_header <- FALSE
+  }
+
+  has_header
+}
diff --git a/R/read_first_line.R b/R/read_first_line.R
new file mode 100644
index 0000000..cd188a4
--- /dev/null
+++ b/R/read_first_line.R
@@ -0,0 +1,27 @@
+read_first_line <- function(filepath, encoding = NULL, delimiter = NULL) {
+  if (!is_text_file(filepath)) {
+    return(NA)
+  }
+
+  if (is.null(encoding)) {
+    encoding <- guess_file_encoding(filepath)
+  }
+
+  if (is.null(delimiter)) {
+    delimiter <- guess_delimiter(filepath)
+  }
+
+  if (any(is.na(c(encoding, delimiter)))) {
+    return(NA)
+  }
+
+  readr::read_delim(
+    file = filepath,
+    delim = delimiter,
+    n_max = 1L,
+    locale = readr::locale(encoding = encoding),
+    col_names = FALSE,
+    show_col_types = FALSE,
+    progress = FALSE
+  )
+}
diff --git a/R/read_portfolio_csv.R b/R/read_portfolio_csv.R
index 8a38400..d7716e9 100644
--- a/R/read_portfolio_csv.R
+++ b/R/read_portfolio_csv.R
@@ -64,6 +64,13 @@ read_portfolio_csv <- function(filepaths, combine = TRUE) {
 
       headers <- determine_headers(filepath)
 
+      file_has_header <-
+        has_header(
+          filepath = filepath,
+          encoding = encoding,
+          delimiter = delimiter
+        )
+
       if (length(headers) == 3) {
         col_types <-
           readr::cols(
@@ -87,7 +94,7 @@ read_portfolio_csv <- function(filepaths, combine = TRUE) {
       portfolio_df <-
         readr::read_delim(
           file = filepath,
-          skip = 1L,
+          skip = dplyr::if_else(file_has_header, 1L, 0L),
           col_names = names(headers),
           col_types = col_types,
           locale = locale,
diff --git a/tests/testthat/test-determine_header_types.R b/tests/testthat/test-determine_header_types.R
new file mode 100644
index 0000000..157c6f7
--- /dev/null
+++ b/tests/testthat/test-determine_header_types.R
@@ -0,0 +1,15 @@
+test_that("properly identifies header types of standard portfolio CSV", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('investor_name,portfolio_name,isin,currency,market_value\nInvestor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file)
+  expect_identical(determine_header_types(csv_file), rep("character", 5))
+
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('isin,currency,market_value\nXS1088274672,1000.34,USD', csv_file)
+  expect_identical(determine_header_types(csv_file), rep("character", 3))
+})
+
+test_that("properly identifies header types with a numeric header", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('isin,123', csv_file)
+  expect_identical(determine_header_types(csv_file), c("character", "double"))
+})
diff --git a/tests/testthat/test-has_header.R b/tests/testthat/test-has_header.R
new file mode 100644
index 0000000..c5e3546
--- /dev/null
+++ b/tests/testthat/test-has_header.R
@@ -0,0 +1,17 @@
+test_that("identifies a portfolio CSV with only a header but no data", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('investor_name,portfolio_name,isin,currency,market_value', csv_file)
+  expect_true(has_header(csv_file))
+})
+
+test_that("identifies a portfolio CSV with only data and no header", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('Investor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file)
+  expect_false(has_header(csv_file))
+})
+
+test_that("identifies a portfolio CSV with a header and data", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('investor_name,portfolio_name,isin,currency,market_value\nInvestor Name,Portfolio Name,XS1088274672,1000.34,USD', csv_file)
+  expect_true(has_header(csv_file))
+})
diff --git a/tests/testthat/test-read_first_line.R b/tests/testthat/test-read_first_line.R
new file mode 100644
index 0000000..d10415d
--- /dev/null
+++ b/tests/testthat/test-read_first_line.R
@@ -0,0 +1,45 @@
+test_that("properly reads first line of standard portfolio CSV", {
+  portfolio <-
+    data.frame(
+      isin = "",
+      currency = "USD",
+      market_value = 1000.34
+    )
+
+  expected_result <-
+    tibble::tibble(
+      "X1" = "isin",
+      "X2" = "currency",
+      "X3" = "market_value"
+    )
+
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  readr::write_csv(portfolio, file = csv_file)
+  expect_identical(read_first_line(csv_file), expected_result)
+})
+
+test_that("properly reads first line of CSV with only a header", {
+  expected_result <-
+    tibble::tibble(
+      "X1" = "isin",
+      "X2" = "currency",
+      "X3" = "market_value"
+    )
+
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines('isin,currency,market_value', csv_file)
+  expect_identical(read_first_line(csv_file), expected_result)
+})
+
+test_that("properly reads first line of CSV with no header", {
+  expected_result <-
+    tibble::tibble(
+      "X1" = "XS1088274672",
+      "X2" = 1000.34,
+      "X3" = "USD"
+    )
+
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines("XS1088274672,1000.34,USD", csv_file)
+  expect_identical(read_first_line(csv_file), expected_result)
+})
diff --git a/tests/testthat/test-read_portfolio_csv.R b/tests/testthat/test-read_portfolio_csv.R
index ba89045..efeddb3 100644
--- a/tests/testthat/test-read_portfolio_csv.R
+++ b/tests/testthat/test-read_portfolio_csv.R
@@ -186,3 +186,17 @@ test_that("reads a portfolio CSV with numeric names as characters", {
   expect_type(result$investor_name, "character")
   expect_type(result$portfolio_name, "character")
 })
+
+test_that("deals with a portfolio CSV with only a header appropriately", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  readr::write_csv(portfolio_min[0, ], file = csv_file)
+  result <- read_portfolio_csv(csv_file)
+  expect_equal(result, NA)
+})
+
+test_that("deals with a portfolio CSV with no header and only one row of data", {
+  csv_file <- withr::local_tempfile(fileext = ".csv")
+  writeLines(paste(portfolio_min, collapse = ","), csv_file)
+  result <- read_portfolio_csv(csv_file)
+  expect_equal(unlist(result), unlist(portfolio_min))
+})