to_dicts

bcpeinhardt · Oct 27, 2024 · 27e5f91 · 27e5f91
1 parent 2dcfe61
commit 27e5f91
Show file tree

Hide file tree

Showing 4 changed files with 140 additions and 65 deletions.
diff --git a/gleam.toml b/gleam.toml
@@ -13,6 +13,7 @@ internal_modules = [
 
 [dependencies]
 gleam_stdlib = ">= 0.40.0 and < 1.0.0"
+glearray = ">= 1.0.0 and < 2.0.0"
 
 [dev-dependencies]
 gleeunit = "~> 1.0"

diff --git a/manifest.toml b/manifest.toml
@@ -12,6 +12,7 @@ packages = [
   { name = "gleam_erlang", version = "0.27.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_erlang", source = "hex", outer_checksum = "DE468F676D71B313C6C8C5334425CFCF827837333F8AB47B64D8A6D7AA40185D" },
   { name = "gleam_json", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib", "thoas"], otp_app = "gleam_json", source = "hex", outer_checksum = "9063D14D25406326C0255BDA0021541E797D8A7A12573D849462CAFED459F6EB" },
   { name = "gleam_stdlib", version = "0.40.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "86606B75A600BBD05E539EB59FABC6E307EEEA7B1E5865AFB6D980A93BCB2181" },
+  { name = "glearray", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glearray", source = "hex", outer_checksum = "B99767A9BC63EF9CC8809F66C7276042E5EFEACAA5B25188B552D3691B91AC6D" },
   { name = "gleeunit", version = "1.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "F7A7228925D3EE7D0813C922E062BFD6D7E9310F0BEE585D3A42F3307E3CFD13" },
   { name = "glexer", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glexer", source = "hex", outer_checksum = "BD477AD657C2B637FEF75F2405FAEFFA533F277A74EF1A5E17B55B1178C228FB" },
   { name = "justin", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "justin", source = "hex", outer_checksum = "7FA0C6DB78640C6DC5FBFD59BF3456009F3F8B485BF6825E97E1EB44E9A1E2CD" },
@@ -24,4 +25,5 @@ packages = [
 [requirements]
 birdie = { version = ">= 1.2.3 and < 2.0.0" }
 gleam_stdlib = { version = ">= 0.40.0 and < 1.0.0" }
+glearray = { version = ">= 1.0.0 and < 2.0.0" }
 gleeunit = { version = "~> 1.0" }
diff --git a/src/gsv.gleam b/src/gsv.gleam
@@ -2,17 +2,22 @@ import gleam/dict.{type Dict}
 import gleam/list
 import gleam/result
 import gleam/string
+import glearray
 
 // --- TYPES -------------------------------------------------------------------
 
 pub type ParseError {
+  /// This error can occur if a csv field contains an unescaped double
+  /// quote `"`.
+  ///
   /// A field can contain a double quote only if it is escaped (that is,
-  /// surrounded by double quotes). For example `wobb"le` would be an invalid
+  /// surrounded by double quotes). For example `wibb"le` would be an invalid
   /// field, the correct way to write such a field would be like this:
-  /// `"wobb""le"`.
+  /// `"wibb""le"`.
   ///
   UnescapedQuote(
     /// The byte index of the unescaped double quote.
+    ///
     position: Int,
   )
 
@@ -21,21 +26,45 @@ pub type ParseError {
   ///
   UnclosedEscapedField(
     /// The byte index of the start of the unclosed escaped field.
+    ///
     start: Int,
   )
 }
 
+/// Possible line endings used when turning a parsed csv back into a string
+/// with the `from_lists` and `from_dicts` functions.
+///
+pub type LineEnding {
+  /// The CRLF line ending: `\r\n`.
+  ///
+  Windows
+
+  /// The LF line ending: `\n`.
+  Unix
+}
+
+fn le_to_string(le: LineEnding) -> String {
+  case le {
+    Windows -> "\r\n"
+    Unix -> "\n"
+  }
+}
+
 // --- PARSING -----------------------------------------------------------------
 
-/// Parses a csv string into a list of lists of strings.
+/// Parses a csv string into a list of lists of strings: each line of the csv
+/// will be turned into a list with an item for each field.
+///
 /// ## Examples
 ///
 /// ```gleam
 /// "hello, world
-/// goodbye, mars
-/// "
+/// goodbye, mars"
 /// |> gsv.to_lists
-/// // [["hello", " world"], ["goodbye", " mars"]]
+/// // Ok([
+/// //    ["hello", " world"],
+/// //    ["goodbye", " mars"],
+/// // ])
 /// ```
 ///
 /// > This implementation tries to stick as closely as possible to
@@ -62,9 +91,22 @@ pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) {
 /// This is used to keep track of what the parser is doing.
 ///
 type ParseStatus {
+  /// We're in the middle of parsing an escaped csv field (that is, starting
+  /// and ending with `"`).
+  ///
   ParsingEscapedField
+
+  /// We're in the middle of parsing a regular csv field.
+  ///
   ParsingUnescapedField
+
+  /// We've just run into a (non escaped) comma, signalling the end of a field.
+  ///
   CommaFound
+
+  /// We've just run into a (non escaped) newline (either a `\n` or `\r\n`),
+  /// signalling the end of a line and the start of a new one.
+  ///
   NewlineFound
 }
 
@@ -268,67 +310,72 @@ fn extract_field(
   let field = slice_bytes(string, from, length)
   case status {
     CommaFound | ParsingUnescapedField | NewlineFound -> field
+    // If we were parsing an escaped field then escaped quotes must be replaced
+    // with a single one.
     ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"")
   }
 }
 
-/// Parses a csv string to a list of dicts.
-/// Automatically handles Windows and Unix line endings.
-/// Returns a string error msg if the string is not valid csv.
-/// Unquoted strings are trimmed, while quoted strings have leading and trailing
-/// whitespace preserved.
-/// Whitespace only or empty strings are not valid headers and will be ignored.
-/// Whitespace only or empty strings are not considered "present" in the csv row and
-/// are not inserted into the row dict.
+/// Parses a csv string into a list of dicts: the first line of the csv is
+/// interpreted as the headers' row and each of the following lines is turned
+/// into a dict with a value for each of the headers.
+///
+/// If a field is empty then it won't be added to the dict.
+///
+/// ## Examples
+///
+/// ```gleam
+/// "pet,name,cuteness
+/// dog,Fido,100
+/// cat,,1000
+/// "
+/// |> gsv.to_dicts
+/// // Ok([
+/// //    dict.from_list([
+/// //      #("pet", "dog"), #("name", "Fido"), #("cuteness", "100")
+/// //    ]),
+/// //    dict.from_list([
+/// //      #("pet", "cat"), #("cuteness", "1000")
+/// //    ]),
+/// // ])
+/// ```
+///
+/// > Just like `to_lists`, this implementation tries to stick as closely as
+/// > possible to [RFC4180](https://www.ietf.org/rfc/rfc4180.txt).
+/// > You can look at `to_lists`' documentation to see how it differs from the
+/// > RFC.
+///
 pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError) {
-  use lol <- result.try(to_lists(input))
-  case lol {
+  use rows <- result.map(to_lists(input))
+  case rows {
     [] -> []
     [headers, ..rows] -> {
-      let headers =
-        list.index_fold(headers, dict.new(), fn(acc, x, i) {
-          case string.trim(x) == "" {
-            True -> acc
-            False -> dict.insert(acc, i, x)
-          }
-        })
+      let headers = glearray.from_list(headers)
 
-      list.map(rows, fn(row) {
-        use acc, x, i <- list.index_fold(row, dict.new())
-        case dict.get(headers, i) {
-          Error(Nil) -> acc
-          Ok(h) ->
-            case string.trim(x) {
-              "" -> acc
-              t -> dict.insert(acc, string.trim(h), t)
-            }
-        }
-      })
+      use row <- list.map(rows)
+      use row, field, index <- list.index_fold(row, dict.new())
+      case field {
+        // If the field is empty then we don't add it to the row's dict.
+        "" -> row
+        _ ->
+          // We look for the header corresponding to this field's position.
+          case glearray.get(headers, index) {
+            Ok(header) -> dict.insert(row, header, field)
+            // This could happen if the row has more fields than headers in the
+            // header row; in this case the field is just discarded.
+            Error(_) -> row
+          }
+      }
     }
   }
-  |> Ok
-}
-
-/// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows"
-/// line endings. Use with the `from_lists` function when
-/// writing to a csv string.
-pub type LineEnding {
-  Windows
-  Unix
-}
-
-fn le_to_string(le: LineEnding) -> String {
-  case le {
-    Windows -> "\r\n"
-    Unix -> "\n"
-  }
 }
 
 /// Takes a list of lists of strings and writes it to a csv string.
 /// Will automatically escape strings that contain double quotes or
 /// line endings with double quotes (in csv, double quotes get escaped by doing
 /// a double doublequote)
 /// The string `he"llo\n` becomes `"he""llo\n"`
+///
 pub fn from_lists(
   input: List(List(String)),
   separator separator: String,
@@ -360,6 +407,7 @@ pub fn from_lists(
 /// line endings with double quotes (in csv, double quotes get escaped by doing
 /// a double doublequote)
 /// The string `he"llo\n` becomes `"he""llo\n"`
+///
 pub fn from_dicts(
   input: List(Dict(String, String)),
   separator separator: String,
@@ -400,9 +448,9 @@ pub fn from_dicts(
 /// yield valid utf8 slices.
 ///
 @external(erlang, "gsv_ffi", "slice")
-@external(javascript, "../gsv_ffi.mjs", "slice")
+@external(javascript, "./gsv_ffi.mjs", "slice")
 fn slice_bytes(string: String, from: Int, length: Int) -> String
 
 @external(erlang, "gsv_ffi", "drop_bytes")
-@external(javascript, "../gsv_ffi.mjs", "drop_bytes")
+@external(javascript, "./gsv_ffi.mjs", "drop_bytes")
 fn drop_bytes(string: String, bytes: Int) -> String
diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
@@ -133,7 +133,9 @@ pub fn escaped_field_with_escaped_double_quotes_test() {
 // --- DICT PARSING ------------------------------------------------------------
 
 pub fn headers_test() {
-  "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n"
+  "name,age
+Ben,27,TRUE,Hello
+Austin,27,"
   |> gsv.to_dicts
   |> should.be_ok
   |> should.equal([
@@ -143,20 +145,30 @@ pub fn headers_test() {
 }
 
 pub fn dicts_with_empty_str_header_test() {
-  "name,\"  \",   ,,age\nBen,foo,bar,baz,27,extra_data"
+  "name,\"  \",   ,,age
+Ben,wibble,wobble,woo,27,extra_data"
   |> gsv.to_dicts
   |> should.be_ok
-  |> gsv.from_dicts(",", Unix)
-  |> should.equal("age,name\n27,Ben")
+  |> should.equal([
+    dict.from_list([
+      #("name", "Ben"),
+      #("  ", "wibble"),
+      #("   ", "wobble"),
+      #("", "woo"),
+      #("age", "27"),
+    ]),
+  ])
 }
 
 pub fn dicts_with_empty_values_test() {
-  "name, age\nBen,,,,\nAustin, 27"
+  "name,age
+Ben,,,,
+Austin,27"
   |> gsv.to_dicts
   |> should.be_ok
   |> should.equal([
     dict.from_list([#("name", "Ben")]),
-    dict.from_list([#("age", "27"), #("name", "Austin")]),
+    dict.from_list([#("name", "Austin"), #("age", "27")]),
   ])
 }
 
@@ -221,21 +233,33 @@ Austin, 25, FALSE"
 }
 
 pub fn encode_with_escaped_string_windows_test() {
-  let assert Ok(lls) =
-    "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE"
+  let assert Ok(rows) =
+    "Ben, 25,' TRUE\n\r'' '
+Austin, 25, FALSE"
+    |> string.replace(each: "'", with: "\"")
     |> gsv.to_lists
 
-  lls
+  rows
   |> gsv.from_lists(separator: ",", line_ending: Windows)
-  |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\r\nAustin,25,FALSE")
+  |> string.replace(each: "\"", with: "'")
+  |> should.equal(
+    "Ben, 25,' TRUE\n\r'' '\r
+Austin, 25, FALSE",
+  )
 }
 
 pub fn dicts_round_trip_test() {
-  "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n"
+  "name,age
+Ben,27,TRUE,Hello
+Austin,27,"
   |> gsv.to_dicts
   |> should.be_ok
   |> gsv.from_dicts(",", Unix)
-  |> should.equal("age,name\n27,Ben\n27,Austin")
+  |> should.equal(
+    "age,name
+27,Ben
+27,Austin",
+  )
 }
 
 // --- TEST HELPERS ------------------------------------------------------------
@@ -312,5 +336,5 @@ fn do_position_to_line_and_column(
 }
 
 @external(erlang, "gsv_ffi", "drop_bytes")
-@external(javascript, "../src/gsv_ffi.mjs", "drop_bytes")
+@external(javascript, "./gsv_ffi.mjs", "drop_bytes")
 fn drop_bytes(string: String, bytes: Int) -> String