From b87cde6bfc67fcacd6dbc39e984146f71911e729 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 14:11:26 +0200
Subject: [PATCH 01/23] wip

---
 gleam.toml                   |   2 +-
 manifest.toml                |   6 +-
 src/gsv.gleam                |  14 +-
 src/gsv/internal/parse.gleam | 261 +++++++++++++++++++++++++++++++++++
 src/gsv_ffi.erl              |  11 ++
 src/gsv_ffi.mjs              |   7 +
 6 files changed, 290 insertions(+), 11 deletions(-)
 create mode 100644 src/gsv/internal/parse.gleam
 create mode 100644 src/gsv_ffi.erl
 create mode 100644 src/gsv_ffi.mjs

diff --git a/gleam.toml b/gleam.toml
index 0161427..5080dd0 100644
--- a/gleam.toml
+++ b/gleam.toml
@@ -12,7 +12,7 @@ internal_modules = [
 ]
 
 [dependencies]
-gleam_stdlib = "~> 0.34 or ~> 1.0"
+gleam_stdlib = ">= 0.40.0 and < 1.0.0"
 
 [dev-dependencies]
 gleeunit = "~> 1.0"
diff --git a/manifest.toml b/manifest.toml
index 7762492..5cfd9ac 100644
--- a/manifest.toml
+++ b/manifest.toml
@@ -2,10 +2,10 @@
 # You typically do not need to edit this file
 
 packages = [
-  { name = "gleam_stdlib", version = "0.34.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "1FB8454D2991E9B4C0C804544D8A9AD0F6184725E20D63C3155F0AEB4230B016" },
-  { name = "gleeunit", version = "1.0.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "D364C87AFEB26BDB4FB8A5ABDE67D635DC9FA52D6AB68416044C35B096C6882D" },
+  { name = "gleam_stdlib", version = "0.40.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "86606B75A600BBD05E539EB59FABC6E307EEEA7B1E5865AFB6D980A93BCB2181" },
+  { name = "gleeunit", version = "1.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "F7A7228925D3EE7D0813C922E062BFD6D7E9310F0BEE585D3A42F3307E3CFD13" },
 ]
 
 [requirements]
-gleam_stdlib = { version = "~> 0.34 or ~> 1.0" }
+gleam_stdlib = { version = ">= 0.40.0 and < 1.0.0" }
 gleeunit = { version = "~> 1.0" }
diff --git a/src/gsv.gleam b/src/gsv.gleam
index 25b031a..b3dafa1 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -9,7 +9,7 @@ import gsv/internal/token.{Location}
 /// Parses a csv string to a list of lists of strings.
 /// Automatically handles Windows and Unix line endings.
 /// Returns a string error msg if the string is not valid csv.
-/// Unquoted strings are trimmed, while quoted strings have leading and trailing 
+/// Unquoted strings are trimmed, while quoted strings have leading and trailing
 /// whitespace preserved.
 pub fn to_lists(input: String) -> Result(List(List(String)), String) {
   input
@@ -28,14 +28,14 @@ pub fn to_lists(input: String) -> Result(List(List(String)), String) {
   })
 }
 
-/// Parses a csv string to a list of dicts. 
+/// Parses a csv string to a list of dicts.
 /// Automatically handles Windows and Unix line endings.
 /// Returns a string error msg if the string is not valid csv.
-/// Unquoted strings are trimmed, while quoted strings have leading and trailing 
+/// Unquoted strings are trimmed, while quoted strings have leading and trailing
 /// whitespace preserved.
-/// Whitespace only or empty strings are not valid headers and will be ignored. 
-/// Whitespace only or empty strings are not considered "present" in the csv row and 
-/// are not inserted into the row dict. 
+/// Whitespace only or empty strings are not valid headers and will be ignored.
+/// Whitespace only or empty strings are not considered "present" in the csv row and
+/// are not inserted into the row dict.
 pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), String) {
   use lol <- result.try(to_lists(input))
   case lol {
@@ -66,7 +66,7 @@ pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), String) {
 }
 
 /// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows"
-/// line endings. Use with the `from_lists` function when 
+/// line endings. Use with the `from_lists` function when
 /// writing to a csv string.
 pub type LineEnding {
   Windows
diff --git a/src/gsv/internal/parse.gleam b/src/gsv/internal/parse.gleam
new file mode 100644
index 0000000..e3e462e
--- /dev/null
+++ b/src/gsv/internal/parse.gleam
@@ -0,0 +1,261 @@
+import gleam/list
+import gleam/string
+
+pub fn parse(string) -> Result(List(List(String)), ParseError) {
+  case string {
+    // We just ignore all unescaped newlines at the beginning of a file.
+    "\n" <> rest | "\r\n" <> rest -> parse(rest)
+    // If it starts with a `"` then we know it starts with an escaped field.
+    "\"" <> rest -> do_parse(rest, string, 1, 0, [], [], ParsingEscapedField)
+    // If it starts with a `,` then it starts with an empty field we're filling
+    // out manually.
+    "," <> rest -> do_parse(rest, string, 1, 0, [""], [], CommaFound)
+    // Otherwise we just start parsing the first unescaped field.
+    _ -> do_parse(string, string, 0, 0, [], [], ParsingUnescapedField)
+  }
+}
+
+pub type ParseError {
+  /// A field can contain a double quote only if it is escaped (that is,
+  /// surrounded by double quotes). For example `wobb"le` would be an invalid
+  /// field, the correct way to write such a field would be like this:
+  /// `"wobb""le"`.
+  ///
+  UnescapedQuote(
+    /// The byte index of the unescaped double quote.
+    position: Int,
+  )
+
+  /// This error can occur when the file ends without the closing `"` of an
+  /// escaped field. For example: `"hello`.
+  ///
+  UnclosedEscapedField(
+    /// The byte index of the start of the unclosed escaped field.
+    start: Int,
+  )
+}
+
+type ParseStatus {
+  ParsingEscapedField
+  ParsingUnescapedField
+  CommaFound
+  NewlineFound
+}
+
+/// ## What does this scary looking function do?
+///
+/// At a high level, it goes over the csv `string` byte-by-byte and parses rows
+/// accumulating those into `rows` as it goes.
+///
+///
+/// ## Why does it have all these parameters? What does each one do?
+///
+/// In order to be extra efficient this function parses the csv file in a single
+/// pass and uses string slicing to avoid copying data.
+/// Each time we see a new field we keep track of the byte where it starts with
+/// `field_start` and then count the bytes (that's the `field_length` variable)
+/// until we find its end (either a newline, the end of the file, or a `,`).
+///
+/// After reaching the end of a field we extract it from the original string
+/// taking a slice that goes from `field_start` and has `field_length` bytes.
+/// This is where the magic happens: slicing a string this way is a constant
+/// time operation and doesn't copy the string so it's crazy fast!
+///
+/// `row` is an accumulator with all the fields of the current row as
+/// they are parsed. Once we run into a newline `row` is added to all
+/// the other `rows`.
+///
+/// We also keep track of _what_ we're parsing with the `status` to make
+/// sure that we're correctly dealing with escaped fields and double quotes.
+///
+fn do_parse(
+  string: String,
+  original: String,
+  field_start: Int,
+  field_length: Int,
+  row: List(String),
+  rows: List(List(String)),
+  status: ParseStatus,
+) -> Result(List(List(String)), ParseError) {
+  case string, status {
+    // If we find a comma we're done with the current field and can take a slice
+    // going from `field_start` with `field_length` bytes:
+    //
+    //     wibble,wobble,...
+    //     ╰────╯ field_length = 6
+    //     ┬
+    //     ╰ field_start
+    //
+    // After taking the slice we move the slice start _after_ the comma:
+    //
+    //     wibble,wobble,...
+    //            ┬
+    //            ╰ field_start = field_start + field_length + 1 (the comma)
+    //
+    "," <> rest, CommaFound
+    | "," <> rest, NewlineFound
+    | "," <> rest, ParsingUnescapedField
+    -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = [field, ..row]
+      let field_start = field_start + field_length + 1
+      do_parse(rest, original, field_start, 0, row, rows, CommaFound)
+    }
+    "\"," <> rest, ParsingEscapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = [field, ..row]
+      let field_start = field_start + field_length + 2
+      do_parse(rest, original, field_start, 0, row, rows, CommaFound)
+    }
+
+    // When the string is over we're done parsing.
+    // We take the final field we were in the middle of parsing and add it to
+    // the current row that is returned together with all the parsed rows.
+    //
+    "", ParsingUnescapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = list.reverse([field, ..row])
+      Ok(list.reverse([row, ..rows]))
+    }
+
+    "", CommaFound -> {
+      let row = list.reverse(["", ..row])
+      Ok(list.reverse([row, ..rows]))
+    }
+
+    "", NewlineFound -> Ok(list.reverse(rows))
+
+    // If the string is over and we were parsing an escaped field, that's an
+    // error. We would expect to find a closing double quote before the end of
+    // the data.
+    //
+    "", ParsingEscapedField -> Error(UnclosedEscapedField(field_start))
+
+    // When we run into a new line (CRLF or just LF) we know we're done with the
+    // current field and take a slice of it, just like we did in the previous
+    // branch!
+    // The only difference is we also add the current `row` to all the other
+    // ones and start with a new one.
+    //
+    // > ⚠️ As per RFC 4180 lines should only be delimited by a CRLF.
+    // > Here we do something slightly different and also accept lines that are
+    // > delimited by just LF too.
+    //
+    // The next three branches are the same except for the new `field_start`
+    // that has to take into account the different lengths.
+    // I tried writing it as `"\n" as sep | "\r\n" as sep | ...` and then
+    // adding the length of `sep`, but it had a noticeable (albeit small) impact
+    // on performance.
+    //
+    "\n" <> rest, ParsingUnescapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = list.reverse([field, ..row])
+      let rows = [row, ..rows]
+      let field_start = field_start + field_length + 1
+      do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
+    }
+    "\r\n" <> rest, ParsingUnescapedField | "\"\n" <> rest, ParsingEscapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = list.reverse([field, ..row])
+      let rows = [row, ..rows]
+      let field_start = field_start + field_length + 2
+      do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
+    }
+    "\"\r\n" <> rest, ParsingEscapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = list.reverse([field, ..row])
+      let rows = [row, ..rows]
+      let field_start = field_start + field_length + 3
+      do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
+    }
+
+    // If the newline comes immediately after a comma then the row ends with an
+    // empty field.
+    //
+    "\n" <> rest, CommaFound -> {
+      let row = list.reverse(["", ..row])
+      let rows = [row, ..rows]
+      do_parse(rest, original, field_start + 1, 0, [], rows, NewlineFound)
+    }
+    "\r\n" <> rest, CommaFound -> {
+      let row = list.reverse(["", ..row])
+      let rows = [row, ..rows]
+      do_parse(rest, original, field_start + 2, 0, [], rows, NewlineFound)
+    }
+
+    // If the newline immediately comes after a newline that means we've run
+    // into an empty line that we can just safely ignore.
+    //
+    "\n" <> rest, NewlineFound ->
+      do_parse(rest, original, field_start + 1, 0, row, rows, status)
+    "\r\n" <> rest, NewlineFound ->
+      do_parse(rest, original, field_start + 2, 0, row, rows, status)
+
+    // An escaped quote found while parsing an escaped field.
+    //
+    "\"\"" <> rest, ParsingEscapedField ->
+      do_parse(rest, original, field_start, field_length + 2, row, rows, status)
+
+    // An unescaped quote found while parsing a field.
+    //
+    "\"" <> _, ParsingUnescapedField | "\"" <> _, ParsingEscapedField ->
+      Error(UnescapedQuote(position: field_start + field_length))
+
+    // If the quote is found immediately after a comma or a newline that signals
+    // the start of a new escaped field to parse.
+    //
+    "\"" <> rest, CommaFound | "\"" <> rest, NewlineFound -> {
+      let status = ParsingEscapedField
+      do_parse(rest, original, field_start + 1, 0, row, rows, status)
+    }
+
+    // In all other cases we're still parsing a field so we just drop a byte
+    // from the string we're iterating through, increase the size of the slice
+    // we need to take and keep going.
+    //
+    // > ⚠️ Notice how we're not trying to trim any whitespace at the
+    // > beginning or end of a field: RFC 4180 states that "Spaces are
+    // > considered part of a field and should not be ignored."
+    //
+    _, CommaFound
+    | _, NewlineFound
+    | _, ParsingUnescapedField
+    | _, ParsingEscapedField
+    -> {
+      let status = case status {
+        ParsingEscapedField -> ParsingEscapedField
+        CommaFound | NewlineFound | ParsingUnescapedField ->
+          ParsingUnescapedField
+      }
+      let rest = drop_bytes(string, 1)
+      do_parse(rest, original, field_start, field_length + 1, row, rows, status)
+    }
+  }
+}
+
+fn extract_field(
+  string: String,
+  from: Int,
+  length: Int,
+  status: ParseStatus,
+) -> String {
+  let field = slice_bytes(string, from, length)
+  case status {
+    CommaFound | ParsingUnescapedField | NewlineFound -> field
+    ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"")
+  }
+}
+
+/// In general this wouldn't be safe, by just slicing random bytes in the middle
+/// of a utf8 string we might end up with something that is not a valid utf8
+/// string.
+/// However, the parser only slices fields in between commas so it should always
+/// yield valid utf8 slices.
+///
+@external(erlang, "gsv_ffi", "slice")
+@external(javascript, "../../gsv_ffi.mjs", "slice")
+fn slice_bytes(string: String, from: Int, length: Int) -> String
+
+@external(erlang, "gsv_ffi", "drop_bytes")
+@external(javascript, "../../gsv_ffi.mjs", "drop_bytes")
+fn drop_bytes(string: String, bytes: Int) -> String
diff --git a/src/gsv_ffi.erl b/src/gsv_ffi.erl
new file mode 100644
index 0000000..acec49e
--- /dev/null
+++ b/src/gsv_ffi.erl
@@ -0,0 +1,11 @@
+-module(gsv_ffi).
+-export([slice/3, drop_bytes/2]).
+
+slice(String, Index, Length) ->
+    binary:part(String, Index, Length).
+
+drop_bytes(String, Bytes) ->
+    case String of
+        <<_:Bytes/bytes, Rest/binary>> -> Rest;
+        <<>> -> <<>>
+    end.
diff --git a/src/gsv_ffi.mjs b/src/gsv_ffi.mjs
new file mode 100644
index 0000000..468b4dc
--- /dev/null
+++ b/src/gsv_ffi.mjs
@@ -0,0 +1,7 @@
+export function slice(string, start, size) {
+  return string.slice(start, start + size);
+}
+
+export function drop_bytes(string, bytes) {
+  return string.slice(bytes);
+}

From 71c56ccafade8538d7503e95cae5086d446cf1dd Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 14:48:52 +0200
Subject: [PATCH 02/23] Remove deprecated code

---
 test/gsv_test.gleam | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
index 4a14ebc..41291dc 100644
--- a/test/gsv_test.gleam
+++ b/test/gsv_test.gleam
@@ -1,8 +1,5 @@
 import gleam/dict
-import gleam/int
-import gleam/list
 import gleam/result
-import gleam/string
 import gleeunit
 import gleeunit/should
 import gsv.{Unix, Windows}
@@ -78,30 +75,6 @@ pub fn last_line_has_optional_line_ending_test() {
   |> should.equal(Ok([["test"], ["test"], ["test"]]))
 }
 
-// ---------- Example doing CSV string -> Custom type ------------------------
-pub type User {
-  User(name: String, age: Int)
-}
-
-fn from_list(record: List(String)) -> Result(User, Nil) {
-  use name <- result.try(list.at(record, 0))
-  use age_str <- result.try(list.at(record, 1))
-  use age <- result.try(int.parse(string.trim(age_str)))
-  Ok(User(name, age))
-}
-
-pub fn decode_to_type_test() {
-  let assert Ok(lls) =
-    "Ben, 25\nAustin, 21"
-    |> gsv.to_lists
-  let users =
-    list.fold(lls, [], fn(acc, record) { [from_list(record), ..acc] })
-    |> list.reverse
-
-  users
-  |> should.equal([Ok(User("Ben", 25)), Ok(User("Austin", 21))])
-}
-
 // ---------------------------------------------------------------------------
 
 pub fn encode_test() {

From 73d451d7b541aed88a268ac7735e087ac158dbf0 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 14:51:01 +0200
Subject: [PATCH 03/23] stop testing internals of the library

---
 test/gsv_test.gleam | 69 ---------------------------------------------
 1 file changed, 69 deletions(-)

diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
index 41291dc..904acab 100644
--- a/test/gsv_test.gleam
+++ b/test/gsv_test.gleam
@@ -1,62 +1,18 @@
 import gleam/dict
-import gleam/result
 import gleeunit
 import gleeunit/should
 import gsv.{Unix, Windows}
-import gsv/internal/ast.{ParseError, parse}
-import gsv/internal/token.{
-  CR, Comma, Doublequote, LF, Location, Textdata, scan, with_location,
-}
 
 pub fn main() {
   gleeunit.main()
 }
 
-pub fn scan_test() {
-  "Ben, 25,\" TRUE\r\n\""
-  |> scan
-  |> should.equal([
-    Textdata("Ben"),
-    Comma,
-    Textdata(" 25"),
-    Comma,
-    Doublequote,
-    Textdata(" TRUE"),
-    CR,
-    LF,
-    Doublequote,
-  ])
-}
-
-pub fn parse_test() {
-  "Ben, 25,\" TRUE\n\r\"\"\"\nAustin, 25, FALSE"
-  |> scan
-  |> with_location
-  |> parse
-  |> should.equal(Ok([["Ben", "25", " TRUE\n\r\""], ["Austin", "25", "FALSE"]]))
-}
-
-pub fn parse_empty_string_fail_test() {
-  ""
-  |> scan
-  |> with_location
-  |> parse
-  |> result.nil_error
-  |> should.equal(Error(Nil))
-}
-
 pub fn csv_parse_test() {
   "Ben, 25,\" TRUE\n\r\"\"\"\nAustin, 25, FALSE"
   |> gsv.to_lists
   |> should.equal(Ok([["Ben", "25", " TRUE\n\r\""], ["Austin", "25", "FALSE"]]))
 }
 
-pub fn scan_crlf_test() {
-  "\r\n"
-  |> scan
-  |> should.equal([CR, LF])
-}
-
 pub fn parse_crlf_test() {
   "test\ntest\r\ntest"
   |> gsv.to_lists
@@ -116,31 +72,6 @@ pub fn for_the_readme_test() {
   |> should.equal("Hello,World\r\nGoodbye,Mars")
 }
 
-pub fn error_cases_test() {
-  let produce_error = fn(csv_str) {
-    case
-      csv_str
-      |> scan
-      |> with_location
-      |> parse
-    {
-      Ok(_) -> panic as "Expected an error"
-      Error(ParseError(loc, msg)) -> #(loc, msg)
-    }
-  }
-
-  produce_error("Ben, 25,\n, TRUE")
-  |> should.equal(#(
-    Location(2, 1),
-    "Expected escaped or non-escaped string after newline, found: ,",
-  ))
-  produce_error("Austin, 25, FALSE\n\"Ben Peinhardt\", 25,\n, TRUE")
-  |> should.equal(#(
-    Location(3, 1),
-    "Expected escaped or non-escaped string after newline, found: ,",
-  ))
-}
-
 // pub fn totally_panics_test() {
 //   "Ben, 25,, TRUE" |> gsv.to_lists_or_panic
 // }

From f7ddd4e67e63678df7574c34a44d224178cfbb4d Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 14:51:17 +0200
Subject: [PATCH 04/23] remove commented unused test

---
 test/gsv_test.gleam | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
index 904acab..a3a4afc 100644
--- a/test/gsv_test.gleam
+++ b/test/gsv_test.gleam
@@ -72,10 +72,6 @@ pub fn for_the_readme_test() {
   |> should.equal("Hello,World\r\nGoodbye,Mars")
 }
 
-// pub fn totally_panics_test() {
-//   "Ben, 25,, TRUE" |> gsv.to_lists_or_panic
-// }
-
 pub fn totally_doesnt_error_test() {
   "Ben, 25,, TRUE"
   |> gsv.to_lists

From 8c7a66870e566076ac708849189d37f3fbaae6fb Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 15:46:12 +0200
Subject: [PATCH 05/23] fix small bug

---
 src/gsv/internal/parse.gleam | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gsv/internal/parse.gleam b/src/gsv/internal/parse.gleam
index e3e462e..2c54d3d 100644
--- a/src/gsv/internal/parse.gleam
+++ b/src/gsv/internal/parse.gleam
@@ -112,7 +112,7 @@ fn do_parse(
     // We take the final field we were in the middle of parsing and add it to
     // the current row that is returned together with all the parsed rows.
     //
-    "", ParsingUnescapedField -> {
+    "", ParsingUnescapedField | "\"", ParsingEscapedField -> {
       let field = extract_field(original, field_start, field_length, status)
       let row = list.reverse([field, ..row])
       Ok(list.reverse([row, ..rows]))

From 2398cd779ca7c19f5194a3d2264cd5a52deb7cf3 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 15:46:39 +0200
Subject: [PATCH 06/23] use explicit newline in readme example

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a30dbfc..1c8570b 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,9 @@ but if you're looking for that now, I'd recommend doing ffi to an existing parse
 import gsv.{Unix, Windows}
 
 pub fn main() {
-  let csv_str = "Hello, World\nGoodbye, Mars"
+  let csv_str =
+    "Hello,World
+Goodbye,Mars"
 
   // Parse a CSV string to a List(List(String))
   let assert Ok(records) = gsv.to_lists(csv_str)

From 0a0c592906492fd9a7e4aec05202f9b11570dcf9 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 15:47:14 +0200
Subject: [PATCH 07/23] use new implementation for to_lists

---
 src/gsv.gleam | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/src/gsv.gleam b/src/gsv.gleam
index b3dafa1..b467ded 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -1,10 +1,9 @@
 import gleam/dict.{type Dict}
-import gleam/int
+import gleam/io
 import gleam/list
 import gleam/result
 import gleam/string
-import gsv/internal/ast.{ParseError}
-import gsv/internal/token.{Location}
+import gsv/internal/parse
 
 /// Parses a csv string to a list of lists of strings.
 /// Automatically handles Windows and Unix line endings.
@@ -12,19 +11,10 @@ import gsv/internal/token.{Location}
 /// Unquoted strings are trimmed, while quoted strings have leading and trailing
 /// whitespace preserved.
 pub fn to_lists(input: String) -> Result(List(List(String)), String) {
-  input
-  |> token.scan
-  |> token.with_location
-  |> ast.parse
-  |> result.map_error(fn(e) {
-    let ParseError(Location(line, column), msg) = e
-    "["
-    <> "line "
-    <> int.to_string(line)
-    <> " column "
-    <> int.to_string(column)
-    <> "] of csv: "
-    <> msg
+  parse.parse(input)
+  |> result.map_error(fn(error) {
+    io.debug(error)
+    todo as "decide what to do with errors"
   })
 }
 

From beb1617a3036288cb6b56dd4f12a74f1c6c55d2d Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 15:47:25 +0200
Subject: [PATCH 08/23] bunch of tests

---
 test/gsv_test.gleam | 216 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 162 insertions(+), 54 deletions(-)

diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
index a3a4afc..ab40085 100644
--- a/test/gsv_test.gleam
+++ b/test/gsv_test.gleam
@@ -1,89 +1,135 @@
 import gleam/dict
+import gleam/string
 import gleeunit
 import gleeunit/should
-import gsv.{Unix, Windows}
+import gsv.{type LineEnding, Unix, Windows}
 
 pub fn main() {
   gleeunit.main()
 }
 
+// --- LISTS PARSING -----------------------------------------------------------
+
 pub fn csv_parse_test() {
-  "Ben, 25,\" TRUE\n\r\"\"\"\nAustin, 25, FALSE"
+  "Ben,25,true
+Austin,25,false"
   |> gsv.to_lists
-  |> should.equal(Ok([["Ben", "25", " TRUE\n\r\""], ["Austin", "25", "FALSE"]]))
+  |> should.be_ok
+  |> should.equal([["Ben", "25", "true"], ["Austin", "25", "false"]])
 }
 
-pub fn parse_crlf_test() {
-  "test\ntest\r\ntest"
+pub fn csv_with_crlf_test() {
+  "Ben,25,true\r
+Austin,25,false"
   |> gsv.to_lists
-  |> should.equal(Ok([["test"], ["test"], ["test"]]))
+  |> should.be_ok
+  |> should.equal([["Ben", "25", "true"], ["Austin", "25", "false"]])
 }
 
-pub fn parse_lfcr_fails_test() {
-  "test\n\r"
+pub fn csv_with_mixed_newline_kinds_test() {
+  "one
+two\r
+three"
   |> gsv.to_lists
-  |> should.be_error
+  |> should.equal(Ok([["one"], ["two"], ["three"]]))
 }
 
-pub fn last_line_has_optional_line_ending_test() {
-  "test\ntest\r\ntest\n"
+pub fn whitespace_is_not_trimmed_from_fields_test() {
+  "Ben , 25 , true
+Austin , 25 , false"
   |> gsv.to_lists
-  |> should.equal(Ok([["test"], ["test"], ["test"]]))
+  |> should.be_ok
+  |> should.equal([["Ben ", " 25 ", " true"], ["Austin ", " 25 ", " false"]])
 }
 
-// ---------------------------------------------------------------------------
+pub fn empty_lines_are_ignored_test() {
+  "
+one
 
-pub fn encode_test() {
-  let assert Ok(lls) = gsv.to_lists("Ben, 25\nAustin, 21")
-  lls
-  |> gsv.from_lists(separator: ",", line_ending: Unix)
-  |> should.equal("Ben,25\nAustin,21")
+two\r
+\r
+three"
+  |> gsv.to_lists
+  |> should.be_ok
+  |> should.equal([["one"], ["two"], ["three"]])
 }
 
-pub fn encode_with_escaped_string_test() {
-  let assert Ok(lls) =
-    "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE"
-    |> gsv.to_lists
+pub fn last_line_can_end_with_newline_test() {
+  "one\ntwo\n"
+  |> gsv.to_lists
+  |> should.be_ok
+  |> should.equal([["one"], ["two"]])
+}
 
-  lls
-  |> gsv.from_lists(separator: ",", line_ending: Unix)
-  |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\nAustin,25,FALSE")
+pub fn empty_fields_test() {
+  "one,,three"
+  |> gsv.to_lists
+  |> should.be_ok
+  |> should.equal([["one", "", "three"]])
 }
 
-pub fn encode_with_escaped_string_windows_test() {
-  let assert Ok(lls) =
-    "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE"
-    |> gsv.to_lists
+pub fn csv_ending_with_an_empty_field_test() {
+  "one,two,"
+  |> gsv.to_lists
+  |> should.be_ok
+  |> should.equal([["one", "two", ""]])
+}
 
-  lls
-  |> gsv.from_lists(separator: ",", line_ending: Windows)
-  |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\r\nAustin,25,FALSE")
+pub fn csv_starting_with_an_empty_field_test() {
+  ",two,three"
+  |> gsv.to_lists
+  |> should.be_ok
+  |> should.equal([["", "two", "three"]])
 }
 
-pub fn for_the_readme_test() {
-  let csv_str = "Hello, World\nGoodbye, Mars"
+pub fn escaped_field_test() {
+  "'gleam','functional'
+'erlang','functional'"
+  // Writing and escaping the double quotes by hand is a bit noisy and makes it
+  // hard to read the literal string so I prefer to write single quotes
+  // and replace those before parsing :P
+  |> string.replace(each: "'", with: "\"")
+  |> gsv.to_lists
+  |> should.be_ok
+  |> should.equal([["gleam", "functional"], ["erlang", "functional"]])
+}
 
-  // Parse a CSV string to a List(List(String))
-  let assert Ok(records) = gsv.to_lists(csv_str)
+pub fn escaped_field_with_newlines_test() {
+  "'wibble
+wobble','wibble'"
+  |> string.replace(each: "'", with: "\"")
+  |> gsv.to_lists
+  |> should.be_ok
+  |> should.equal([["wibble\nwobble", "wibble"]])
+}
 
-  // Write a List(List(String)) to a CSV string
-  records
-  |> gsv.from_lists(separator: ",", line_ending: Windows)
-  |> should.equal("Hello,World\r\nGoodbye,Mars")
+pub fn escaped_field_with_crlf_test() {
+  "'wibble\r
+wobble','wibble'"
+  |> string.replace(each: "'", with: "\"")
+  |> gsv.to_lists
+  |> should.be_ok
+  |> should.equal([["wibble\r\nwobble", "wibble"]])
 }
 
-pub fn totally_doesnt_error_test() {
-  "Ben, 25,, TRUE"
+pub fn escaped_field_with_comma_test() {
+  "'wibble,wobble','wibble'"
+  |> string.replace(each: "'", with: "\"")
   |> gsv.to_lists
-  |> should.equal(Ok([["Ben", "25", "", "TRUE"]]))
+  |> should.be_ok
+  |> should.equal([["wibble,wobble", "wibble"]])
 }
 
-pub fn trailing_commas_fine_test() {
-  "Ben, 25, TRUE, Hello\nAustin, 25,\n"
+pub fn escaped_field_with_escaped_double_quotes_test() {
+  "'escaped double quote -> '''"
+  |> string.replace(each: "'", with: "\"")
   |> gsv.to_lists
-  |> should.equal(Ok([["Ben", "25", "TRUE", "Hello"], ["Austin", "25", ""]]))
+  |> should.be_ok
+  |> should.equal([["escaped double quote -> \""]])
 }
 
+// --- DICT PARSING ------------------------------------------------------------
+
 pub fn headers_test() {
   "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n"
   |> gsv.to_dicts
@@ -94,14 +140,6 @@ pub fn headers_test() {
   ])
 }
 
-pub fn dicts_round_trip_test() {
-  "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n"
-  |> gsv.to_dicts
-  |> should.be_ok
-  |> gsv.from_dicts(",", Unix)
-  |> should.equal("age,name\n27,Ben\n27,Austin")
-}
-
 pub fn dicts_with_empty_str_header_test() {
   "name,\"  \",   ,,age\nBen,foo,bar,baz,27,extra_data"
   |> gsv.to_dicts
@@ -146,3 +184,73 @@ pub fn quotes_test() {
     ["11/11/2024", "Apples", "7", "5"],
   ])
 }
+
+// --- TESTING ERRORS ----------------------------------------------------------
+
+pub fn double_quote_in_middle_of_field_test() {
+  "field,other\"field"
+  |> gsv.to_lists
+  |> should.be_error
+  |> should.equal(todo)
+}
+
+pub fn unescaped_double_quote_in_escaped_field_test() {
+  "'unescaped double quote -> ' in escaped field'"
+  |> string.replace(each: "'", with: "\"")
+  |> gsv.to_lists
+  |> should.be_error
+  |> should.equal(todo)
+}
+
+pub fn unescaped_carriage_return_test() {
+  todo as "decide what to do"
+  "test\n\r"
+  |> gsv.to_lists
+  |> should.be_error
+}
+
+// --- ENCODING TESTS ----------------------------------------------------------
+
+pub fn encode_test() {
+  "Ben, 25
+Austin, 21"
+  |> test_lists_roundtrip(",", Unix)
+}
+
+pub fn encode_with_escaped_string_test() {
+  "Ben, 25,' TRUE
+\r'' '
+Austin, 25, FALSE"
+  |> string.replace(each: "'", with: "\"")
+  |> test_lists_roundtrip(",", Unix)
+}
+
+pub fn encode_with_escaped_string_windows_test() {
+  let assert Ok(lls) =
+    "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE"
+    |> gsv.to_lists
+
+  lls
+  |> gsv.from_lists(separator: ",", line_ending: Windows)
+  |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\r\nAustin,25,FALSE")
+}
+
+pub fn dicts_round_trip_test() {
+  "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n"
+  |> gsv.to_dicts
+  |> should.be_ok
+  |> gsv.from_dicts(",", Unix)
+  |> should.equal("age,name\n27,Ben\n27,Austin")
+}
+
+// --- TEST HELPERS ------------------------------------------------------------
+
+fn test_lists_roundtrip(
+  input: String,
+  separator: String,
+  line_ending: LineEnding,
+) -> Nil {
+  let assert Ok(parsed) = gsv.to_lists(input)
+  let encoded = gsv.from_lists(parsed, separator, line_ending)
+  encoded |> should.equal(input)
+}

From 2d7cebd89457937f95c2bd7c116b0a9f82df1ad1 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 16:05:02 +0200
Subject: [PATCH 09/23] documentation for to_lists

---
 src/gsv.gleam | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/gsv.gleam b/src/gsv.gleam
index b467ded..1990eef 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -5,11 +5,24 @@ import gleam/result
 import gleam/string
 import gsv/internal/parse
 
-/// Parses a csv string to a list of lists of strings.
-/// Automatically handles Windows and Unix line endings.
-/// Returns a string error msg if the string is not valid csv.
-/// Unquoted strings are trimmed, while quoted strings have leading and trailing
-/// whitespace preserved.
+/// Parses a csv string into a list of lists of strings.
+/// ## Examples
+///
+/// ```gleam
+/// "hello, world
+/// goodbye, mars
+/// "
+/// |> gsv.to_lists
+/// // Ok([["hello", " world"], ["goodbye", " mars"]])
+/// ```
+///
+/// > This implementation tries to stick as closely as possible to
+/// > [RFC4180](https://www.ietf.org/rfc/rfc4180.txt), with a couple notable
+/// > convenience differences:
+/// > - both `\n` and `\r\n` line endings are accepted.
+/// > - a line can start with an empty field `,two,three`.
+/// > - empty lines are allowed and just ignored.
+///
 pub fn to_lists(input: String) -> Result(List(List(String)), String) {
   parse.parse(input)
   |> result.map_error(fn(error) {

From 4f290960e6cf691ce0382184c6c428ebdc4987ad Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 16:11:50 +0200
Subject: [PATCH 10/23] remove old implementation

---
 src/gsv/internal/token.gleam | 103 -----------------------------------
 1 file changed, 103 deletions(-)
 delete mode 100644 src/gsv/internal/token.gleam

diff --git a/src/gsv/internal/token.gleam b/src/gsv/internal/token.gleam
deleted file mode 100644
index 98f2aef..0000000
--- a/src/gsv/internal/token.gleam
+++ /dev/null
@@ -1,103 +0,0 @@
-//// We are using the following grammar for CSV from rfc4180
-////
-//// file = [header CRLF] record *(CRLF record) [CRLF]
-////   header = name *(COMMA name)
-////  record = field *(COMMA field)
-////  name = field
-////  field = (escaped / non-escaped)
-////  escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE
-////  non-escaped = *TEXTDATA
-
-import gleam/list
-import gleam/string
-
-pub type CsvToken {
-  Comma
-  LF
-  CR
-  Doublequote
-  Textdata(inner: String)
-}
-
-pub type Location {
-  Location(line: Int, column: Int)
-}
-
-pub fn to_lexeme(token: CsvToken) -> String {
-  case token {
-    Comma -> ","
-    LF -> "\n"
-    CR -> "\r"
-    Doublequote -> "\""
-    Textdata(str) -> str
-  }
-}
-
-fn len(token: CsvToken) -> Int {
-  case token {
-    Comma -> 1
-    LF -> 1
-    CR -> 1
-    Doublequote -> 1
-    Textdata(str) -> string.length(str)
-  }
-}
-
-pub fn scan(input: String) -> List(CsvToken) {
-  input
-  |> string.to_utf_codepoints
-  |> list.fold([], fn(acc, x) {
-    case string.utf_codepoint_to_int(x) {
-      0x2c -> [Comma, ..acc]
-      0x22 -> [Doublequote, ..acc]
-      0x0a -> [LF, ..acc]
-      0x0D -> [CR, ..acc]
-      _ -> {
-        let cp = string.from_utf_codepoints([x])
-        case acc {
-          [Textdata(str), ..rest] -> [Textdata(str <> cp), ..rest]
-          _ -> [Textdata(cp), ..acc]
-        }
-      }
-    }
-  })
-  |> list.reverse
-}
-
-pub fn with_location(input: List(CsvToken)) -> List(#(CsvToken, Location)) {
-  do_with_location(input, [], Location(1, 1))
-  |> list.reverse
-}
-
-fn do_with_location(
-  input: List(CsvToken),
-  acc: List(#(CsvToken, Location)),
-  curr_loc: Location,
-) -> List(#(CsvToken, Location)) {
-  let Location(line, column) = curr_loc
-  case input {
-    // Base case, no more tokens
-    [] -> acc
-
-    // A newline, increment line number
-    [LF, ..rest] -> {
-      do_with_location(rest, [#(LF, curr_loc), ..acc], Location(line + 1, 1))
-    }
-    [CR, LF, ..rest] -> {
-      do_with_location(
-        rest,
-        [#(LF, Location(line, column + 1)), #(CR, curr_loc), ..acc],
-        Location(line + 1, 1),
-      )
-    }
-
-    // Any other token just increment the column
-    [token, ..rest] -> {
-      do_with_location(
-        rest,
-        [#(token, curr_loc), ..acc],
-        Location(line, column + len(token)),
-      )
-    }
-  }
-}

From 26407da35a011755abac3e3883e77f2d57723205 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 16:18:54 +0200
Subject: [PATCH 11/23] one big file good

---
 src/gsv.gleam                | 275 +++++++++++++++++++++++++++++++++--
 src/gsv/internal/parse.gleam | 261 ---------------------------------
 2 files changed, 266 insertions(+), 270 deletions(-)
 delete mode 100644 src/gsv/internal/parse.gleam

diff --git a/src/gsv.gleam b/src/gsv.gleam
index 1990eef..07b0eec 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -1,9 +1,31 @@
 import gleam/dict.{type Dict}
-import gleam/io
 import gleam/list
 import gleam/result
 import gleam/string
-import gsv/internal/parse
+
+// --- TYPES -------------------------------------------------------------------
+
+pub type ParseError {
+  /// A field can contain a double quote only if it is escaped (that is,
+  /// surrounded by double quotes). For example `wobb"le` would be an invalid
+  /// field, the correct way to write such a field would be like this:
+  /// `"wobb""le"`.
+  ///
+  UnescapedQuote(
+    /// The byte index of the unescaped double quote.
+    position: Int,
+  )
+
+  /// This error can occur when the file ends without the closing `"` of an
+  /// escaped field. For example: `"hello`.
+  ///
+  UnclosedEscapedField(
+    /// The byte index of the start of the unclosed escaped field.
+    start: Int,
+  )
+}
+
+// --- PARSING -----------------------------------------------------------------
 
 /// Parses a csv string into a list of lists of strings.
 /// ## Examples
@@ -23,12 +45,231 @@ import gsv/internal/parse
 /// > - a line can start with an empty field `,two,three`.
 /// > - empty lines are allowed and just ignored.
 ///
-pub fn to_lists(input: String) -> Result(List(List(String)), String) {
-  parse.parse(input)
-  |> result.map_error(fn(error) {
-    io.debug(error)
-    todo as "decide what to do with errors"
-  })
+pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) {
+  case input {
+    // We just ignore all unescaped newlines at the beginning of a file.
+    "\n" <> rest | "\r\n" <> rest -> to_lists(rest)
+    // If it starts with a `"` then we know it starts with an escaped field.
+    "\"" <> rest -> do_parse(rest, input, 1, 0, [], [], ParsingEscapedField)
+    // If it starts with a `,` then it starts with an empty field we're filling
+    // out manually.
+    "," <> rest -> do_parse(rest, input, 1, 0, [""], [], CommaFound)
+    // Otherwise we just start parsing the first unescaped field.
+    _ -> do_parse(input, input, 0, 0, [], [], ParsingUnescapedField)
+  }
+}
+
+/// This is used to keep track of what the parser is doing.
+///
+type ParseStatus {
+  ParsingEscapedField
+  ParsingUnescapedField
+  CommaFound
+  NewlineFound
+}
+
+/// ## What does this scary looking function do?
+///
+/// At a high level, it goes over the csv `string` byte-by-byte and parses rows
+/// accumulating those into `rows` as it goes.
+///
+///
+/// ## Why does it have all these parameters? What does each one do?
+///
+/// In order to be extra efficient this function parses the csv file in a single
+/// pass and uses string slicing to avoid copying data.
+/// Each time we see a new field we keep track of the byte where it starts with
+/// `field_start` and then count the bytes (that's the `field_length` variable)
+/// until we find its end (either a newline, the end of the file, or a `,`).
+///
+/// After reaching the end of a field we extract it from the original string
+/// taking a slice that goes from `field_start` and has `field_length` bytes.
+/// This is where the magic happens: slicing a string this way is a constant
+/// time operation and doesn't copy the string so it's crazy fast!
+///
+/// `row` is an accumulator with all the fields of the current row as
+/// they are parsed. Once we run into a newline `row` is added to all
+/// the other `rows`.
+///
+/// We also keep track of _what_ we're parsing with the `status` to make
+/// sure that we're correctly dealing with escaped fields and double quotes.
+///
+fn do_parse(
+  string: String,
+  original: String,
+  field_start: Int,
+  field_length: Int,
+  row: List(String),
+  rows: List(List(String)),
+  status: ParseStatus,
+) -> Result(List(List(String)), ParseError) {
+  case string, status {
+    // If we find a comma we're done with the current field and can take a slice
+    // going from `field_start` with `field_length` bytes:
+    //
+    //     wibble,wobble,...
+    //     ╰────╯ field_length = 6
+    //     ┬
+    //     ╰ field_start
+    //
+    // After taking the slice we move the slice start _after_ the comma:
+    //
+    //     wibble,wobble,...
+    //            ┬
+    //            ╰ field_start = field_start + field_length + 1 (the comma)
+    //
+    "," <> rest, CommaFound
+    | "," <> rest, NewlineFound
+    | "," <> rest, ParsingUnescapedField
+    -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = [field, ..row]
+      let field_start = field_start + field_length + 1
+      do_parse(rest, original, field_start, 0, row, rows, CommaFound)
+    }
+    "\"," <> rest, ParsingEscapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = [field, ..row]
+      let field_start = field_start + field_length + 2
+      do_parse(rest, original, field_start, 0, row, rows, CommaFound)
+    }
+
+    // When the string is over we're done parsing.
+    // We take the final field we were in the middle of parsing and add it to
+    // the current row that is returned together with all the parsed rows.
+    //
+    "", ParsingUnescapedField | "\"", ParsingEscapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = list.reverse([field, ..row])
+      Ok(list.reverse([row, ..rows]))
+    }
+
+    "", CommaFound -> {
+      let row = list.reverse(["", ..row])
+      Ok(list.reverse([row, ..rows]))
+    }
+
+    "", NewlineFound -> Ok(list.reverse(rows))
+
+    // If the string is over and we were parsing an escaped field, that's an
+    // error. We would expect to find a closing double quote before the end of
+    // the data.
+    //
+    "", ParsingEscapedField -> Error(UnclosedEscapedField(field_start))
+
+    // When we run into a new line (CRLF or just LF) we know we're done with the
+    // current field and take a slice of it, just like we did in the previous
+    // branch!
+    // The only difference is we also add the current `row` to all the other
+    // ones and start with a new one.
+    //
+    // > ⚠️ As per RFC 4180 lines should only be delimited by a CRLF.
+    // > Here we do something slightly different and also accept lines that are
+    // > delimited by just LF too.
+    //
+    // The next three branches are the same except for the new `field_start`
+    // that has to take into account the different lengths.
+    // I tried writing it as `"\n" as sep | "\r\n" as sep | ...` and then
+    // adding the length of that but it had a noticeable (albeit small) impact
+    // on performance.
+    //
+    "\n" <> rest, ParsingUnescapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = list.reverse([field, ..row])
+      let rows = [row, ..rows]
+      let field_start = field_start + field_length + 1
+      do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
+    }
+    "\r\n" <> rest, ParsingUnescapedField | "\"\n" <> rest, ParsingEscapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = list.reverse([field, ..row])
+      let rows = [row, ..rows]
+      let field_start = field_start + field_length + 2
+      do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
+    }
+    "\"\r\n" <> rest, ParsingEscapedField -> {
+      let field = extract_field(original, field_start, field_length, status)
+      let row = list.reverse([field, ..row])
+      let rows = [row, ..rows]
+      let field_start = field_start + field_length + 3
+      do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
+    }
+
+    // If the newline is immediately after a comma then the row ends with an
+    // empty field.
+    //
+    "\n" <> rest, CommaFound -> {
+      let row = list.reverse(["", ..row])
+      let rows = [row, ..rows]
+      do_parse(rest, original, field_start + 1, 0, [], rows, NewlineFound)
+    }
+    "\r\n" <> rest, CommaFound -> {
+      let row = list.reverse(["", ..row])
+      let rows = [row, ..rows]
+      do_parse(rest, original, field_start + 2, 0, [], rows, NewlineFound)
+    }
+
+    // If the newline immediately comes after a newline that means we've run
+    // into an empty line that we can just safely ignore.
+    //
+    "\n" <> rest, NewlineFound ->
+      do_parse(rest, original, field_start + 1, 0, row, rows, status)
+    "\r\n" <> rest, NewlineFound ->
+      do_parse(rest, original, field_start + 2, 0, row, rows, status)
+
+    // An escaped quote found while parsing an escaped field.
+    //
+    "\"\"" <> rest, ParsingEscapedField ->
+      do_parse(rest, original, field_start, field_length + 2, row, rows, status)
+
+    // An unescaped quote found while parsing a field.
+    //
+    "\"" <> _, ParsingUnescapedField | "\"" <> _, ParsingEscapedField ->
+      Error(UnescapedQuote(position: field_start + field_length))
+
+    // If the quote is found immediately after a comma or a newline that signals
+    // the start of a new escaped field to parse.
+    //
+    "\"" <> rest, CommaFound | "\"" <> rest, NewlineFound -> {
+      let status = ParsingEscapedField
+      do_parse(rest, original, field_start + 1, 0, row, rows, status)
+    }
+
+    // In all other cases we're still parsing a field so we just drop a byte
+    // from the string we're iterating through, increase the size of the slice
+    // we need to take and keep going.
+    //
+    // > ⚠️ Notice how we're not trying to trim any whitespaces at the
+    // > beginning or end of a field: RFC 4180 states that "Spaces are
+    // > considered part of a field and should not be ignored."
+    //
+    _, CommaFound
+    | _, NewlineFound
+    | _, ParsingUnescapedField
+    | _, ParsingEscapedField
+    -> {
+      let status = case status {
+        ParsingEscapedField -> ParsingEscapedField
+        CommaFound | NewlineFound | ParsingUnescapedField ->
+          ParsingUnescapedField
+      }
+      let rest = drop_bytes(string, 1)
+      do_parse(rest, original, field_start, field_length + 1, row, rows, status)
+    }
+  }
+}
+
+fn extract_field(
+  string: String,
+  from: Int,
+  length: Int,
+  status: ParseStatus,
+) -> String {
+  let field = slice_bytes(string, from, length)
+  case status {
+    CommaFound | ParsingUnescapedField | NewlineFound -> field
+    ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"")
+  }
 }
 
 /// Parses a csv string to a list of dicts.
@@ -39,7 +280,7 @@ pub fn to_lists(input: String) -> Result(List(List(String)), String) {
 /// Whitespace only or empty strings are not valid headers and will be ignored.
 /// Whitespace only or empty strings are not considered "present" in the csv row and
 /// are not inserted into the row dict.
-pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), String) {
+pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError) {
   use lol <- result.try(to_lists(input))
   case lol {
     [] -> []
@@ -149,3 +390,19 @@ pub fn from_dicts(
     }
   }
 }
+
+// --- FFI HELPERS -------------------------------------------------------------
+
+/// In general this wouldn't be safe, by just slicing random bytes in the middle
+/// of a utf8 string we might end up with something that is not a valid utf8
+/// string.
+/// However, the parser only slices fields in between commas so it should always
+/// yield valid utf8 slices.
+///
+@external(erlang, "gsv_ffi", "slice")
+@external(javascript, "./gsv_ffi.mjs", "slice")
+fn slice_bytes(string: String, from: Int, length: Int) -> String
+
+@external(erlang, "gsv_ffi", "drop_bytes")
+@external(javascript, "./gsv_ffi.mjs", "drop_bytes")
+fn drop_bytes(string: String, bytes: Int) -> String
diff --git a/src/gsv/internal/parse.gleam b/src/gsv/internal/parse.gleam
deleted file mode 100644
index 2c54d3d..0000000
--- a/src/gsv/internal/parse.gleam
+++ /dev/null
@@ -1,261 +0,0 @@
-import gleam/list
-import gleam/string
-
-pub fn parse(string) -> Result(List(List(String)), ParseError) {
-  case string {
-    // We just ignore all unescaped newlines at the beginning of a file.
-    "\n" <> rest | "\r\n" <> rest -> parse(rest)
-    // If it starts with a `"` then we know it starts with an escaped field.
-    "\"" <> rest -> do_parse(rest, string, 1, 0, [], [], ParsingEscapedField)
-    // If it starts with a `,` then it starts with an empty field we're filling
-    // out manually.
-    "," <> rest -> do_parse(rest, string, 1, 0, [""], [], CommaFound)
-    // Otherwise we just start parsing the first unescaped field.
-    _ -> do_parse(string, string, 0, 0, [], [], ParsingUnescapedField)
-  }
-}
-
-pub type ParseError {
-  /// A field can contain a double quote only if it is escaped (that is,
-  /// surrounded by double quotes). For example `wobb"le` would be an invalid
-  /// field, the correct way to write such a field would be like this:
-  /// `"wobb""le"`.
-  ///
-  UnescapedQuote(
-    /// The byte index of the unescaped double.
-    position: Int,
-  )
-
-  /// This error can occur when the file ends without the closing `"` of an
-  /// escaped field. For example: `"hello`.
-  ///
-  UnclosedEscapedField(
-    /// The byte index of the start of the unclosed escaped field.
-    start: Int,
-  )
-}
-
-type ParseStatus {
-  ParsingEscapedField
-  ParsingUnescapedField
-  CommaFound
-  NewlineFound
-}
-
-/// ## What does this scary looking function do?
-///
-/// At a high level, it goes over the csv `string` byte-by-byte and parses rows
-/// accumulating those into `rows` as it goes.
-///
-///
-/// ## Why does it have all these parameters? What does each one do?
-///
-/// In order to be extra efficient this function parses the csv file in a single
-/// pass and uses string slicing to avoid copying data.
-/// Each time we see a new field we keep track of the byte where it starts with
-/// `field_start` and then count the bytes (that's the `field_length` variable)
-/// until we fiend its end (either a newline, the end of the file, or a `,`).
-///
-/// After reaching the end of a field we extract it from the original string
-/// taking a slice that goes from `field_start` and has `field_length` bytes.
-/// This is where the magic happens: slicing a string this way is a constant
-/// time operation and doesn't copy the string so it's crazy fast!
-///
-/// `row` is an accumulator with all the fields of the current row as
-/// they are parsed. Once we run into a newline `current_row` is added to all
-/// the other `rows`.
-///
-/// We also keep track of _what_ we're parsing with the `status` to make
-/// sure that we're correctly dealing with escaped fields and double quotes.
-///
-fn do_parse(
-  string: String,
-  original: String,
-  field_start: Int,
-  field_length: Int,
-  row: List(String),
-  rows: List(List(String)),
-  status: ParseStatus,
-) -> Result(List(List(String)), ParseError) {
-  case string, status {
-    // If we find a comma we're done with the current field and can take a slice
-    // going from `field_start` with `field_length` bytes:
-    //
-    //     wibble,wobble,...
-    //     ╰────╯ field_length = 6
-    //     ┬
-    //     ╰ field_start
-    //
-    // After taking the slice we move the slice start _after_ the comma:
-    //
-    //     wibble,wobble,...
-    //            ┬
-    //            ╰ field_start = field_start + field_length + 1 (the comma)
-    //
-    "," <> rest, CommaFound
-    | "," <> rest, NewlineFound
-    | "," <> rest, ParsingUnescapedField
-    -> {
-      let field = extract_field(original, field_start, field_length, status)
-      let row = [field, ..row]
-      let field_start = field_start + field_length + 1
-      do_parse(rest, original, field_start, 0, row, rows, CommaFound)
-    }
-    "\"," <> rest, ParsingEscapedField -> {
-      let field = extract_field(original, field_start, field_length, status)
-      let row = [field, ..row]
-      let field_start = field_start + field_length + 2
-      do_parse(rest, original, field_start, 0, row, rows, CommaFound)
-    }
-
-    // When the string is over we're done parsing.
-    // We take the final field we were in the middle of parsing and add it to
-    // the current row that is returned together with all the parsed rows.
-    //
-    "", ParsingUnescapedField | "\"", ParsingEscapedField -> {
-      let field = extract_field(original, field_start, field_length, status)
-      let row = list.reverse([field, ..row])
-      Ok(list.reverse([row, ..rows]))
-    }
-
-    "", CommaFound -> {
-      let row = list.reverse(["", ..row])
-      Ok(list.reverse([row, ..rows]))
-    }
-
-    "", NewlineFound -> Ok(list.reverse(rows))
-
-    // If the string is over and we were parsing an escaped field, that's an
-    // error. We would expect to find a closing double quote before the end of
-    // the data.
-    //
-    "", ParsingEscapedField -> Error(UnclosedEscapedField(field_start))
-
-    // When we run into a new line (CRLF or just LF) we know we're done with the
-    // current field and take a slice of it, just like we did in the previous
-    // branch!
-    // The only difference is we also add the current `row` to all the other
-    // ones and start with a new one.
-    //
-    // > ⚠️ As for RFC 4180 lines should only be delimited by a CRLF.
-    // > Here we do something slightly different and also accept lines that are
-    // > delimited by just LF too.
-    //
-    // The next three branches are the same except for the new `field_start`
-    // that has to take into account the different lengths.
-    // I tried writing it as `"\n" as sep | "\r\n" as sep | ...` and then taking
-    // adding the lenght of that but it had a noticeable (albeit small) impact
-    // on performance.
-    //
-    "\n" <> rest, ParsingUnescapedField -> {
-      let field = extract_field(original, field_start, field_length, status)
-      let row = list.reverse([field, ..row])
-      let rows = [row, ..rows]
-      let field_start = field_start + field_length + 1
-      do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
-    }
-    "\r\n" <> rest, ParsingUnescapedField | "\"\n" <> rest, ParsingEscapedField -> {
-      let field = extract_field(original, field_start, field_length, status)
-      let row = list.reverse([field, ..row])
-      let rows = [row, ..rows]
-      let field_start = field_start + field_length + 2
-      do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
-    }
-    "\"\r\n" <> rest, ParsingEscapedField -> {
-      let field = extract_field(original, field_start, field_length, status)
-      let row = list.reverse([field, ..row])
-      let rows = [row, ..rows]
-      let field_start = field_start + field_length + 3
-      do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
-    }
-
-    // If the newlines is immediately after a comma then the row ends with an
-    // empty field.
-    //
-    "\n" <> rest, CommaFound -> {
-      let row = list.reverse(["", ..row])
-      let rows = [row, ..rows]
-      do_parse(rest, original, field_start + 1, 0, [], rows, NewlineFound)
-    }
-    "\r\n" <> rest, CommaFound -> {
-      let row = list.reverse(["", ..row])
-      let rows = [row, ..rows]
-      do_parse(rest, original, field_start + 2, 0, [], rows, NewlineFound)
-    }
-
-    // If the newline immediately comes after a newline that means we've run
-    // into an empty line that we can just safely ignore.
-    //
-    "\n" <> rest, NewlineFound ->
-      do_parse(rest, original, field_start + 1, 0, row, rows, status)
-    "\r\n" <> rest, NewlineFound ->
-      do_parse(rest, original, field_start + 2, 0, row, rows, status)
-
-    // An escaped quote found while parsing an escaped field.
-    //
-    "\"\"" <> rest, ParsingEscapedField ->
-      do_parse(rest, original, field_start, field_length + 2, row, rows, status)
-
-    // An unescaped quote found while parsing a field.
-    //
-    "\"" <> _, ParsingUnescapedField | "\"" <> _, ParsingEscapedField ->
-      Error(UnescapedQuote(position: field_start + field_length))
-
-    // If the quote is found immediately after a comma or a newline that signals
-    // the start of a new escaped field to parse.
-    //
-    "\"" <> rest, CommaFound | "\"" <> rest, NewlineFound -> {
-      let status = ParsingEscapedField
-      do_parse(rest, original, field_start + 1, 0, row, rows, status)
-    }
-
-    // In all other cases we're still parsing a field so we just drop a byte
-    // from the string we're iterating through, increase the size of the slice
-    // we need to take and keep going.
-    //
-    // > ⚠️ Notice how we're not trying to trim any whitespaces at the
-    // > beginning or end of a field: RFC 4810 states that "Spaces are
-    // > considered part of a field and should not be ignored."
-    //
-    _, CommaFound
-    | _, NewlineFound
-    | _, ParsingUnescapedField
-    | _, ParsingEscapedField
-    -> {
-      let status = case status {
-        ParsingEscapedField -> ParsingEscapedField
-        CommaFound | NewlineFound | ParsingUnescapedField ->
-          ParsingUnescapedField
-      }
-      let rest = drop_bytes(string, 1)
-      do_parse(rest, original, field_start, field_length + 1, row, rows, status)
-    }
-  }
-}
-
-fn extract_field(
-  string: String,
-  from: Int,
-  length: Int,
-  status: ParseStatus,
-) -> String {
-  let field = slice_bytes(string, from, length)
-  case status {
-    CommaFound | ParsingUnescapedField | NewlineFound -> field
-    ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"")
-  }
-}
-
-/// In general this wouldn't be safe, by just slicing random bytes in the middle
-/// of a utf8 string we might end up with something that is not a valid utf8
-/// string.
-/// However, the parser only slices fields in between commas so it should always
-/// yield valid utf8 slices.
-///
-@external(erlang, "gsv_ffi", "slice")
-@external(javascript, "../../gsv_ffi.mjs", "slice")
-fn slice_bytes(string: String, from: Int, length: Int) -> String
-
-@external(erlang, "gsv_ffi", "drop_bytes")
-@external(javascript, "../../gsv_ffi.mjs", "drop_bytes")
-fn drop_bytes(string: String, bytes: Int) -> String

From eab6f8c779d73c7dedbb5f9aa416502d97d6e311 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sat, 19 Oct 2024 16:53:38 +0200
Subject: [PATCH 12/23] nice error tests

---
 .../double_quote_in_middle_of_field.accepted  |  9 ++
 .../unclosed_escaped_field.accepted           |  9 ++
 ...ped_double_quote_in_escaped_field.accepted |  9 ++
 gleam.toml                                    |  1 +
 manifest.toml                                 | 16 ++++
 test/gsv_test.gleam                           | 84 +++++++++++++++++--
 6 files changed, 122 insertions(+), 6 deletions(-)
 create mode 100644 birdie_snapshots/double_quote_in_middle_of_field.accepted
 create mode 100644 birdie_snapshots/unclosed_escaped_field.accepted
 create mode 100644 birdie_snapshots/unescaped_double_quote_in_escaped_field.accepted

diff --git a/birdie_snapshots/double_quote_in_middle_of_field.accepted b/birdie_snapshots/double_quote_in_middle_of_field.accepted
new file mode 100644
index 0000000..05bd5bb
--- /dev/null
+++ b/birdie_snapshots/double_quote_in_middle_of_field.accepted
@@ -0,0 +1,9 @@
+---
+version: 1.2.3
+title: double quote in middle of field
+file: ./test/gsv_test.gleam
+test_name: double_quote_in_middle_of_field_test
+---
+field,other"field
+           ┬
+           ╰─ This is an unescaped double quote
\ No newline at end of file
diff --git a/birdie_snapshots/unclosed_escaped_field.accepted b/birdie_snapshots/unclosed_escaped_field.accepted
new file mode 100644
index 0000000..9407731
--- /dev/null
+++ b/birdie_snapshots/unclosed_escaped_field.accepted
@@ -0,0 +1,9 @@
+---
+version: 1.2.3
+title: unclosed escaped field
+file: ./test/gsv_test.gleam
+test_name: unclosed_escaped_field_test
+---
+"closed","unclosed
+          ┬
+          ╰─ This escaped field is not closed
\ No newline at end of file
diff --git a/birdie_snapshots/unescaped_double_quote_in_escaped_field.accepted b/birdie_snapshots/unescaped_double_quote_in_escaped_field.accepted
new file mode 100644
index 0000000..d8a67da
--- /dev/null
+++ b/birdie_snapshots/unescaped_double_quote_in_escaped_field.accepted
@@ -0,0 +1,9 @@
+---
+version: 1.2.3
+title: unescaped double quote in escaped field
+file: ./test/gsv_test.gleam
+test_name: unescaped_double_quote_in_escaped_field_test
+---
+"unescaped double quote -> " in escaped field"
+                           ┬
+                           ╰─ This is an unescaped double quote
\ No newline at end of file
diff --git a/gleam.toml b/gleam.toml
index 5080dd0..a9fa90a 100644
--- a/gleam.toml
+++ b/gleam.toml
@@ -16,3 +16,4 @@ gleam_stdlib = ">= 0.40.0 and < 1.0.0"
 
 [dev-dependencies]
 gleeunit = "~> 1.0"
+birdie = ">= 1.2.3 and < 2.0.0"
diff --git a/manifest.toml b/manifest.toml
index 5cfd9ac..9231034 100644
--- a/manifest.toml
+++ b/manifest.toml
@@ -2,10 +2,26 @@
 # You typically do not need to edit this file
 
 packages = [
+  { name = "argv", version = "1.0.2", build_tools = ["gleam"], requirements = [], otp_app = "argv", source = "hex", outer_checksum = "BA1FF0929525DEBA1CE67256E5ADF77A7CDDFE729E3E3F57A5BDCAA031DED09D" },
+  { name = "birdie", version = "1.2.3", build_tools = ["gleam"], requirements = ["argv", "edit_distance", "filepath", "glance", "gleam_community_ansi", "gleam_erlang", "gleam_stdlib", "justin", "rank", "simplifile", "trie_again"], otp_app = "birdie", source = "hex", outer_checksum = "AE1207210E9CC8F4170BCE3FB3C23932F314C352C3FD1BCEA44CF4BF8CF60F93" },
+  { name = "edit_distance", version = "2.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "edit_distance", source = "hex", outer_checksum = "A1E485C69A70210223E46E63985FA1008B8B2DDA9848B7897469171B29020C05" },
+  { name = "filepath", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", source = "hex", outer_checksum = "EFB6FF65C98B2A16378ABC3EE2B14124168C0CE5201553DE652E2644DCFDB594" },
+  { name = "glance", version = "0.11.0", build_tools = ["gleam"], requirements = ["gleam_stdlib", "glexer"], otp_app = "glance", source = "hex", outer_checksum = "8F3314D27773B7C3B9FB58D8C02C634290422CE531988C0394FA0DF8676B964D" },
+  { name = "gleam_community_ansi", version = "1.4.1", build_tools = ["gleam"], requirements = ["gleam_community_colour", "gleam_stdlib"], otp_app = "gleam_community_ansi", source = "hex", outer_checksum = "4CD513FC62523053E62ED7BAC2F36136EC17D6A8942728250A9A00A15E340E4B" },
+  { name = "gleam_community_colour", version = "1.4.0", build_tools = ["gleam"], requirements = ["gleam_json", "gleam_stdlib"], otp_app = "gleam_community_colour", source = "hex", outer_checksum = "795964217EBEDB3DA656F5EB8F67D7AD22872EB95182042D3E7AFEF32D3FD2FE" },
+  { name = "gleam_erlang", version = "0.27.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_erlang", source = "hex", outer_checksum = "DE468F676D71B313C6C8C5334425CFCF827837333F8AB47B64D8A6D7AA40185D" },
+  { name = "gleam_json", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib", "thoas"], otp_app = "gleam_json", source = "hex", outer_checksum = "9063D14D25406326C0255BDA0021541E797D8A7A12573D849462CAFED459F6EB" },
   { name = "gleam_stdlib", version = "0.40.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "86606B75A600BBD05E539EB59FABC6E307EEEA7B1E5865AFB6D980A93BCB2181" },
   { name = "gleeunit", version = "1.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "F7A7228925D3EE7D0813C922E062BFD6D7E9310F0BEE585D3A42F3307E3CFD13" },
+  { name = "glexer", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glexer", source = "hex", outer_checksum = "BD477AD657C2B637FEF75F2405FAEFFA533F277A74EF1A5E17B55B1178C228FB" },
+  { name = "justin", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "justin", source = "hex", outer_checksum = "7FA0C6DB78640C6DC5FBFD59BF3456009F3F8B485BF6825E97E1EB44E9A1E2CD" },
+  { name = "rank", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "rank", source = "hex", outer_checksum = "5660E361F0E49CBB714CC57CC4C89C63415D8986F05B2DA0C719D5642FAD91C9" },
+  { name = "simplifile", version = "2.2.0", build_tools = ["gleam"], requirements = ["filepath", "gleam_stdlib"], otp_app = "simplifile", source = "hex", outer_checksum = "0DFABEF7DC7A9E2FF4BB27B108034E60C81BEBFCB7AB816B9E7E18ED4503ACD8" },
+  { name = "thoas", version = "1.2.1", build_tools = ["rebar3"], requirements = [], otp_app = "thoas", source = "hex", outer_checksum = "E38697EDFFD6E91BD12CEA41B155115282630075C2A727E7A6B2947F5408B86A" },
+  { name = "trie_again", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "trie_again", source = "hex", outer_checksum = "5B19176F52B1BD98831B57FDC97BD1F88C8A403D6D8C63471407E78598E27184" },
 ]
 
 [requirements]
+birdie = { version = ">= 1.2.3 and < 2.0.0" }
 gleam_stdlib = { version = ">= 0.40.0 and < 1.0.0" }
 gleeunit = { version = "~> 1.0" }
diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
index ab40085..2a9e9e0 100644
--- a/test/gsv_test.gleam
+++ b/test/gsv_test.gleam
@@ -1,4 +1,6 @@
+import birdie
 import gleam/dict
+import gleam/list
 import gleam/string
 import gleeunit
 import gleeunit/should
@@ -189,17 +191,22 @@ pub fn quotes_test() {
 
 pub fn double_quote_in_middle_of_field_test() {
   "field,other\"field"
-  |> gsv.to_lists
-  |> should.be_error
-  |> should.equal(todo)
+  |> pretty_print_error
+  |> birdie.snap("double quote in middle of field")
 }
 
 pub fn unescaped_double_quote_in_escaped_field_test() {
   "'unescaped double quote -> ' in escaped field'"
   |> string.replace(each: "'", with: "\"")
-  |> gsv.to_lists
-  |> should.be_error
-  |> should.equal(todo)
+  |> pretty_print_error
+  |> birdie.snap("unescaped double quote in escaped field")
+}
+
+pub fn unclosed_escaped_field_test() {
+  "'closed','unclosed"
+  |> string.replace(each: "'", with: "\"")
+  |> pretty_print_error
+  |> birdie.snap("unclosed escaped field")
 }
 
 pub fn unescaped_carriage_return_test() {
@@ -254,3 +261,68 @@ fn test_lists_roundtrip(
   let encoded = gsv.from_lists(parsed, separator, line_ending)
   encoded |> should.equal(input)
 }
+
+fn pretty_print_error(input: String) -> String {
+  let assert Error(error) = gsv.to_lists(input)
+  let error_message = error_to_message(error)
+  let #(error_line, error_column) =
+    error_to_position(error)
+    |> position_to_line_and_column(in: input)
+
+  string.replace(in: input, each: "\r\n", with: "\n")
+  |> string.split(on: "\n")
+  |> list.index_map(fn(line, line_number) {
+    case line_number == error_line {
+      False -> line
+      True -> {
+        let padding = string.repeat(" ", error_column)
+        let pointer_line = padding <> "┬"
+        let message_line = padding <> "╰─ " <> error_message
+        line <> "\n" <> pointer_line <> "\n" <> message_line
+      }
+    }
+  })
+  |> string.join(with: "\n")
+}
+
+fn error_to_position(error: gsv.ParseError) -> Int {
+  case error {
+    gsv.UnclosedEscapedField(position) | gsv.UnescapedQuote(position) ->
+      position
+  }
+}
+
+fn error_to_message(error: gsv.ParseError) -> String {
+  case error {
+    gsv.UnclosedEscapedField(_) -> "This escaped field is not closed"
+    gsv.UnescapedQuote(_) -> "This is an unescaped double quote"
+  }
+}
+
+fn position_to_line_and_column(position: Int, in string: String) -> #(Int, Int) {
+  do_position_to_line_and_column(string, position, 0, 0)
+}
+
+fn do_position_to_line_and_column(
+  string: String,
+  position: Int,
+  line: Int,
+  col: Int,
+) -> #(Int, Int) {
+  case position, string {
+    0, _ -> #(line, col)
+    _, "" -> panic as "position out of string bounds"
+    _, "\n" <> rest ->
+      do_position_to_line_and_column(rest, position - 1, line + 1, 0)
+    _, "\r\n" <> rest ->
+      do_position_to_line_and_column(rest, position - 2, line + 1, 0)
+    _, _ -> {
+      let rest = drop_bytes(string, 1)
+      do_position_to_line_and_column(rest, position - 1, line, col + 1)
+    }
+  }
+}
+
+@external(erlang, "gsv_ffi", "drop_bytes")
+@external(javascript, "../src/gsv_ffi.mjs", "drop_bytes")
+fn drop_bytes(string: String, bytes: Int) -> String

From 8b395890d46ae0d76b116029587992493915ccf2 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 10:21:54 +0100
Subject: [PATCH 13/23] fix ffi bug

---
 src/gsv_ffi.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gsv_ffi.erl b/src/gsv_ffi.erl
index acec49e..10f3c64 100644
--- a/src/gsv_ffi.erl
+++ b/src/gsv_ffi.erl
@@ -7,5 +7,5 @@ slice(String, Index, Length) ->
 drop_bytes(String, Bytes) ->
     case String of
         <<_:Bytes/bytes, Rest/binary>> -> Rest;
-        <<>> -> <<>>
+        _ -> String
     end.

From b70c0316accf876aa702e2a21d7763db9c45c2cd Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 11:30:33 +0100
Subject: [PATCH 14/23] to_dicts

---
 gleam.toml          |   1 +
 manifest.toml       |   2 +
 src/gsv.gleam       | 152 +++++++++++++++++++++++++++++---------------
 test/gsv_test.gleam |  50 +++++++++++----
 4 files changed, 140 insertions(+), 65 deletions(-)

diff --git a/gleam.toml b/gleam.toml
index a9fa90a..1e538df 100644
--- a/gleam.toml
+++ b/gleam.toml
@@ -13,6 +13,7 @@ internal_modules = [
 
 [dependencies]
 gleam_stdlib = ">= 0.40.0 and < 1.0.0"
+glearray = ">= 1.0.0 and < 2.0.0"
 
 [dev-dependencies]
 gleeunit = "~> 1.0"
diff --git a/manifest.toml b/manifest.toml
index 9231034..bde2f68 100644
--- a/manifest.toml
+++ b/manifest.toml
@@ -12,6 +12,7 @@ packages = [
   { name = "gleam_erlang", version = "0.27.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_erlang", source = "hex", outer_checksum = "DE468F676D71B313C6C8C5334425CFCF827837333F8AB47B64D8A6D7AA40185D" },
   { name = "gleam_json", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib", "thoas"], otp_app = "gleam_json", source = "hex", outer_checksum = "9063D14D25406326C0255BDA0021541E797D8A7A12573D849462CAFED459F6EB" },
   { name = "gleam_stdlib", version = "0.40.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "86606B75A600BBD05E539EB59FABC6E307EEEA7B1E5865AFB6D980A93BCB2181" },
+  { name = "glearray", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glearray", source = "hex", outer_checksum = "B99767A9BC63EF9CC8809F66C7276042E5EFEACAA5B25188B552D3691B91AC6D" },
   { name = "gleeunit", version = "1.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "F7A7228925D3EE7D0813C922E062BFD6D7E9310F0BEE585D3A42F3307E3CFD13" },
   { name = "glexer", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glexer", source = "hex", outer_checksum = "BD477AD657C2B637FEF75F2405FAEFFA533F277A74EF1A5E17B55B1178C228FB" },
   { name = "justin", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "justin", source = "hex", outer_checksum = "7FA0C6DB78640C6DC5FBFD59BF3456009F3F8B485BF6825E97E1EB44E9A1E2CD" },
@@ -24,4 +25,5 @@ packages = [
 [requirements]
 birdie = { version = ">= 1.2.3 and < 2.0.0" }
 gleam_stdlib = { version = ">= 0.40.0 and < 1.0.0" }
+glearray = { version = ">= 1.0.0 and < 2.0.0" }
 gleeunit = { version = "~> 1.0" }
diff --git a/src/gsv.gleam b/src/gsv.gleam
index 07b0eec..f2087ab 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -2,17 +2,22 @@ import gleam/dict.{type Dict}
 import gleam/list
 import gleam/result
 import gleam/string
+import glearray
 
 // --- TYPES -------------------------------------------------------------------
 
 pub type ParseError {
+  /// This error can occur if a csv field contains an unescaped double
+  /// quote `"`.
+  ///
   /// A field can contain a double quote only if it is escaped (that is,
-  /// surrounded by double quotes). For example `wobb"le` would be an invalid
+  /// surrounded by double quotes). For example `wibb"le` would be an invalid
   /// field, the correct way to write such a field would be like this:
-  /// `"wobb""le"`.
+  /// `"wibb""le"`.
   ///
   UnescapedQuote(
     /// The byte index of the unescaped double.
+    ///
     position: Int,
   )
 
@@ -21,21 +26,45 @@ pub type ParseError {
   ///
   UnclosedEscapedField(
     /// The byte index of the start of the unclosed escaped field.
+    ///
     start: Int,
   )
 }
 
+/// Possible line endings used when turning a parsed csv back into a string
+/// with the `from_lists` and `from_dicts` functions.
+///
+pub type LineEnding {
+  /// The CRLF line ending: `\r\n`.
+  ///
+  Windows
+
+  /// The LF line ending: `\n`.
+  Unix
+}
+
+fn le_to_string(le: LineEnding) -> String {
+  case le {
+    Windows -> "\r\n"
+    Unix -> "\n"
+  }
+}
+
 // --- PARSING -----------------------------------------------------------------
 
-/// Parses a csv string into a list of lists of strings.
+/// Parses a csv string into a list of lists of strings: each line of the csv
+/// will be turned into a list with an item for each field.
+///
 /// ## Examples
 ///
 /// ```gleam
 /// "hello, world
-/// goodbye, mars
-/// "
+/// goodbye, mars"
 /// |> gsv.to_lists
-/// // [["hello", " world"], ["goodbye", " mars"]]
+/// // Ok([
+/// //    ["hello", " world"],
+/// //    ["goodbye", " mars"],
+/// // ])
 /// ```
 ///
 /// > This implementation tries to stick as closely as possible to
@@ -62,9 +91,22 @@ pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) {
 /// This is used to keep track of what the parser is doing.
 ///
 type ParseStatus {
+  /// We're in the middle of parsing an escaped csv field (that is, starting
+  /// and ending with `"`).
+  ///
   ParsingEscapedField
+
+  /// We're in the middle of parsing a regular csv field.
+  ///
   ParsingUnescapedField
+
+  /// We've just run into a (non escaped) comma, signalling the end of a field.
+  ///
   CommaFound
+
+  /// We've just run into a (non escaped) newline (either a `\n` or `\r\n`),
+  /// signalling the end of a line and the start of a new one.
+  ///
   NewlineFound
 }
 
@@ -268,60 +310,64 @@ fn extract_field(
   let field = slice_bytes(string, from, length)
   case status {
     CommaFound | ParsingUnescapedField | NewlineFound -> field
+    // If we were parsing an escaped field then escaped quotes must be replaced
+    // with a single one.
     ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"")
   }
 }
 
-/// Parses a csv string to a list of dicts.
-/// Automatically handles Windows and Unix line endings.
-/// Returns a string error msg if the string is not valid csv.
-/// Unquoted strings are trimmed, while quoted strings have leading and trailing
-/// whitespace preserved.
-/// Whitespace only or empty strings are not valid headers and will be ignored.
-/// Whitespace only or empty strings are not considered "present" in the csv row and
-/// are not inserted into the row dict.
+/// Parses a csv string into a list of dicts: the first line of the csv is
+/// interpreted as the headers' row and each of the following lines is turned
+/// into a dict with a value for each of the headers.
+///
+/// If a field is empty then it won't be added to the dict.
+///
+/// ## Examples
+///
+/// ```gleam
+/// "pet,name,cuteness
+/// dog,Fido,100
+/// cat,,1000
+/// "
+/// |> gsv.to_dicts
+/// // Ok([
+/// //    dict.from_list([
+/// //      #("pet", "dog"), #("name", "Fido"), #("cuteness", "100")
+/// //    ]),
+/// //    dict.from_list([
+/// //      #("pet", "cat"), #("cuteness", "1000")
+/// //    ]),
+/// // ])
+/// ```
+///
+/// > Just like `to_lists`, this implementation tries to stick as closely as
+/// > possible to [RFC4180](https://www.ietf.org/rfc/rfc4180.txt).
+/// > You can look at `to_lists`' documentation to see how it differs from the
+/// > RFC.
+///
 pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError) {
-  use lol <- result.try(to_lists(input))
-  case lol {
+  use rows <- result.map(to_lists(input))
+  case rows {
     [] -> []
     [headers, ..rows] -> {
-      let headers =
-        list.index_fold(headers, dict.new(), fn(acc, x, i) {
-          case string.trim(x) == "" {
-            True -> acc
-            False -> dict.insert(acc, i, x)
-          }
-        })
+      let headers = glearray.from_list(headers)
 
-      list.map(rows, fn(row) {
-        use acc, x, i <- list.index_fold(row, dict.new())
-        case dict.get(headers, i) {
-          Error(Nil) -> acc
-          Ok(h) ->
-            case string.trim(x) {
-              "" -> acc
-              t -> dict.insert(acc, string.trim(h), t)
-            }
-        }
-      })
+      use row <- list.map(rows)
+      use row, field, index <- list.index_fold(row, dict.new())
+      case field {
+        // If the field is empty then we don't add it to the row's dict.
+        "" -> row
+        _ ->
+          // We look for the header corresponding to this field's position.
+          case glearray.get(headers, index) {
+            Ok(header) -> dict.insert(row, header, field)
+            // This could happen if the row has more fields than headers in the
+            // header row, in this case the field is just discarded
+            Error(_) -> row
+          }
+      }
     }
   }
-  |> Ok
-}
-
-/// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows"
-/// line endings. Use with the `from_lists` function when
-/// writing to a csv string.
-pub type LineEnding {
-  Windows
-  Unix
-}
-
-fn le_to_string(le: LineEnding) -> String {
-  case le {
-    Windows -> "\r\n"
-    Unix -> "\n"
-  }
 }
 
 /// Takes a list of lists of strings and writes it to a csv string.
@@ -329,6 +375,7 @@ fn le_to_string(le: LineEnding) -> String {
 /// line endings with double quotes (in csv, double quotes get escaped by doing
 /// a double doublequote)
 /// The string `he"llo\n` becomes `"he""llo\n"`
+///
 pub fn from_lists(
   input: List(List(String)),
   separator separator: String,
@@ -360,6 +407,7 @@ pub fn from_lists(
 /// line endings with double quotes (in csv, double quotes get escaped by doing
 /// a double doublequote)
 /// The string `he"llo\n` becomes `"he""llo\n"`
+///
 pub fn from_dicts(
   input: List(Dict(String, String)),
   separator separator: String,
@@ -400,9 +448,9 @@ pub fn from_dicts(
 /// yield valid utf8 slices.
 ///
 @external(erlang, "gsv_ffi", "slice")
-@external(javascript, "../gsv_ffi.mjs", "slice")
+@external(javascript, "./gsv_ffi.mjs", "slice")
 fn slice_bytes(string: String, from: Int, length: Int) -> String
 
 @external(erlang, "gsv_ffi", "drop_bytes")
-@external(javascript, "../gsv_ffi.mjs", "drop_bytes")
+@external(javascript, "./gsv_ffi.mjs", "drop_bytes")
 fn drop_bytes(string: String, bytes: Int) -> String
diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
index 2a9e9e0..96a88b3 100644
--- a/test/gsv_test.gleam
+++ b/test/gsv_test.gleam
@@ -133,7 +133,9 @@ pub fn escaped_field_with_escaped_double_quotes_test() {
 // --- DICT PARSING ------------------------------------------------------------
 
 pub fn headers_test() {
-  "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n"
+  "name,age
+Ben,27,TRUE,Hello
+Austin,27,"
   |> gsv.to_dicts
   |> should.be_ok
   |> should.equal([
@@ -143,20 +145,30 @@ pub fn headers_test() {
 }
 
 pub fn dicts_with_empty_str_header_test() {
-  "name,\"  \",   ,,age\nBen,foo,bar,baz,27,extra_data"
+  "name,\"  \",   ,,age
+Ben,wibble,wobble,woo,27,extra_data"
   |> gsv.to_dicts
   |> should.be_ok
-  |> gsv.from_dicts(",", Unix)
-  |> should.equal("age,name\n27,Ben")
+  |> should.equal([
+    dict.from_list([
+      #("name", "Ben"),
+      #("  ", "wibble"),
+      #("   ", "wobble"),
+      #("", "woo"),
+      #("age", "27"),
+    ]),
+  ])
 }
 
 pub fn dicts_with_empty_values_test() {
-  "name, age\nBen,,,,\nAustin, 27"
+  "name,age
+Ben,,,,
+Austin,27"
   |> gsv.to_dicts
   |> should.be_ok
   |> should.equal([
     dict.from_list([#("name", "Ben")]),
-    dict.from_list([#("age", "27"), #("name", "Austin")]),
+    dict.from_list([#("name", "Austin"), #("age", "27")]),
   ])
 }
 
@@ -233,21 +245,33 @@ Austin, 25, FALSE"
 }
 
 pub fn encode_with_escaped_string_windows_test() {
-  let assert Ok(lls) =
-    "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE"
+  let assert Ok(rows) =
+    "Ben, 25,' TRUE\n\r'' '
+Austin, 25, FALSE"
+    |> string.replace(each: "'", with: "\"")
     |> gsv.to_lists
 
-  lls
+  rows
   |> gsv.from_lists(separator: ",", line_ending: Windows)
-  |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\r\nAustin,25,FALSE")
+  |> string.replace(each: "\"", with: "'")
+  |> should.equal(
+    "Ben, 25,' TRUE\n\r'' '\r
+Austin, 25, FALSE",
+  )
 }
 
 pub fn dicts_round_trip_test() {
-  "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n"
+  "name,age
+Ben,27,TRUE,Hello
+Austin,27,"
   |> gsv.to_dicts
   |> should.be_ok
   |> gsv.from_dicts(",", Unix)
-  |> should.equal("age,name\n27,Ben\n27,Austin")
+  |> should.equal(
+    "age,name
+27,Ben
+27,Austin",
+  )
 }
 
 // --- TEST HELPERS ------------------------------------------------------------
@@ -324,5 +348,5 @@ fn do_position_to_line_and_column(
 }
 
 @external(erlang, "gsv_ffi", "drop_bytes")
-@external(javascript, "../src/gsv_ffi.mjs", "drop_bytes")
+@external(javascript, "./gsv_ffi.mjs", "drop_bytes")
 fn drop_bytes(string: String, bytes: Int) -> String

From f7225099f7aca9f69789043e2e0bf15ac8acfc61 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 12:42:51 +0100
Subject: [PATCH 15/23] from_lists

---
 src/gsv.gleam       | 129 +++++++++++++++++++++++++++++++++++---------
 test/gsv_test.gleam |   7 ---
 2 files changed, 104 insertions(+), 32 deletions(-)

diff --git a/src/gsv.gleam b/src/gsv.gleam
index f2087ab..199098d 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -43,7 +43,7 @@ pub type LineEnding {
   Unix
 }
 
-fn le_to_string(le: LineEnding) -> String {
+fn line_ending_to_string(le: LineEnding) -> String {
   case le {
     Windows -> "\r\n"
     Unix -> "\n"
@@ -370,36 +370,115 @@ pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError)
   }
 }
 
-/// Takes a list of lists of strings and writes it to a csv string.
-/// Will automatically escape strings that contain double quotes or
-/// line endings with double quotes (in csv, double quotes get escaped by doing
-/// a double doublequote)
-/// The string `he"llo\n` becomes `"he""llo\n"`
+/// Takes a list of lists of strings and turns it into a csv string, automatically
+/// escaping all fields that contain double quotes or line endings.
+///
+/// ## Examples
+///
+/// ```gleam
+/// let rows = [["hello", "world"], ["goodbye", "mars"]]
+/// from_lists(rows, separator: ",", line_ending: Unix)
+/// // "hello,world
+/// // goodbye,mars"
+/// ```
+///
+/// ```gleam
+/// let rows = [[]]
+/// ```
 ///
 pub fn from_lists(
-  input: List(List(String)),
+  rows: List(List(String)),
   separator separator: String,
   line_ending line_ending: LineEnding,
 ) -> String {
-  input
-  |> list.map(fn(row) {
-    list.map(row, fn(entry) {
-      // Double quotes need to be escaped with an extra doublequote
-      let entry = string.replace(entry, "\"", "\"\"")
-
-      // If the string contains a , \n \r\n or " it needs to be escaped by wrapping in double quotes
-      case
-        string.contains(entry, separator)
-        || string.contains(entry, "\n")
-        || string.contains(entry, "\"")
-      {
-        True -> "\"" <> entry <> "\""
-        False -> entry
+  let line_ending = line_ending_to_string(line_ending)
+  do_from_lists(rows, separator, line_ending, "")
+}
+
+fn do_from_lists(
+  rows: List(List(String)),
+  separator: String,
+  line_ending: String,
+  acc: String,
+) -> String {
+  case rows {
+    [] -> acc
+    // When we're down to the last row, we don't add a final newline at the end
+    // of the string. So we special-case this and pass in an empty string
+    // as the `line_ending` to add to the row.
+    [last_row] -> row_to_string(last_row, separator, "", acc)
+    // For all other cases we just accumulate the line string onto the string
+    // accumulator.
+    [row, ..rest] -> {
+      let acc = row_to_string(row, separator, line_ending, acc)
+      do_from_lists(rest, separator, line_ending, acc)
+    }
+  }
+}
+
+fn row_to_string(
+  row: List(String),
+  separator: String,
+  line_ending: String,
+  acc: String,
+) -> String {
+  case row {
+    [] -> acc
+    // When we're down to the last field of the row we need to add the line
+    // ending instead of the field separator. So we special-case this.
+    [last_field] -> acc <> escape_field(last_field, separator) <> line_ending
+    // For all other cases we add the field to the accumulator and append a
+    // separator to separate it from the next field in the row.
+    [field, ..rest] -> {
+      let acc = acc <> escape_field(field, separator) <> separator
+      row_to_string(rest, separator, line_ending, acc)
+    }
+  }
+}
+
+/// The kind of escaping needed by a csv field.
+///
+type Escaping {
+  NoEscaping
+  WrapInDoubleQuotes
+  WrapInDoubleQuotesAndEscapeDoubleQuotes
+}
+
+fn escape_field(field: String, separator: String) -> String {
+  case escaping(field, separator) {
+    NoEscaping -> field
+    WrapInDoubleQuotes -> "\"" <> field <> "\""
+    WrapInDoubleQuotesAndEscapeDoubleQuotes ->
+      "\"" <> string.replace(in: field, each: "\"", with: "\"\"") <> "\""
+  }
+}
+
+fn escaping(string: String, separator: String) -> Escaping {
+  do_escaping(string, separator, NoEscaping)
+}
+
+fn do_escaping(string: String, separator: String, kind: Escaping) {
+  case string {
+    // As soon as we find a double quote we know that we must escape the double
+    // quotes and wrap it in double quotes, no need to keep going through the
+    // string.
+    "\"" <> _ -> WrapInDoubleQuotesAndEscapeDoubleQuotes
+    // If we find a newline we know the string must at least be wrapped in
+    // double quotes but we keep going in case we find a `"`.
+    "\n" <> rest -> do_escaping(rest, separator, WrapInDoubleQuotes)
+    // If we reach the end of the string we return the accumulator.
+    "" -> kind
+    // In all other cases we check if the string starts with the separator, in
+    // that case we know it must be at least wrapped in double quotes.
+    // But we keep going in case we find a `"`.
+    _ -> {
+      let assert Ok(#(_, rest)) = string.pop_grapheme(string)
+      case kind == WrapInDoubleQuotes || string.starts_with(string, separator) {
+        True -> do_escaping(rest, separator, WrapInDoubleQuotes)
+        False -> do_escaping(rest, separator, kind)
       }
-    })
-  })
-  |> list.map(fn(row) { string.join(row, separator) })
-  |> string.join(le_to_string(line_ending))
+    }
+  }
 }
 
 /// Takes a list of dicts and writes it to a csv string.
diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
index 96a88b3..ea69529 100644
--- a/test/gsv_test.gleam
+++ b/test/gsv_test.gleam
@@ -221,13 +221,6 @@ pub fn unclosed_escaped_field_test() {
   |> birdie.snap("unclosed escaped field")
 }
 
-pub fn unescaped_carriage_return_test() {
-  todo as "decide what to do"
-  "test\n\r"
-  |> gsv.to_lists
-  |> should.be_error
-}
-
 // --- ENCODING TESTS ----------------------------------------------------------
 
 pub fn encode_test() {

From 8fa68b0551cfa4e978cc52e9c4317b45154cf237 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 12:58:11 +0100
Subject: [PATCH 16/23] from_dicts

---
 src/gsv.gleam | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/gsv.gleam b/src/gsv.gleam
index 199098d..d2611f7 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -488,36 +488,36 @@ fn do_escaping(string: String, separator: String, kind: Escaping) {
 /// The string `he"llo\n` becomes `"he""llo\n"`
 ///
 pub fn from_dicts(
-  input: List(Dict(String, String)),
+  rows: List(Dict(String, String)),
   separator separator: String,
   line_ending line_ending: LineEnding,
 ) -> String {
-  case input {
+  case rows {
     [] -> ""
     _ -> {
       let headers =
-        input
-        |> list.map(dict.keys)
-        |> list.flatten
+        rows
+        |> list.flat_map(dict.keys)
         |> list.unique
         |> list.sort(string.compare)
 
-      let rows =
-        list.map(input, fn(row) {
-          list.fold(headers, [], fn(acc, h) {
-            case dict.get(row, h) {
-              Ok(v) -> [v, ..acc]
-              Error(Nil) -> ["", ..acc]
-            }
-          })
-        })
-        |> list.map(list.reverse)
-
+      let rows = list.map(rows, row_dict_to_list(_, headers))
       from_lists([headers, ..rows], separator, line_ending)
     }
   }
 }
 
+fn row_dict_to_list(
+  row: Dict(String, String),
+  headers: List(String),
+) -> List(String) {
+  use header <- list.map(headers)
+  case dict.get(row, header) {
+    Ok(field) -> field
+    Error(Nil) -> ""
+  }
+}
+
 // --- FFI HELPERS -------------------------------------------------------------
 
 /// In general this wouldn't be safe, by just slicing random bytes in the middle

From 55ef9eeab9a50e7151e7cb48649171ab6df2515d Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 13:53:55 +0100
Subject: [PATCH 17/23] rebase gone wrong

---
 src/gsv/internal/ast.gleam | 196 -------------------------------------
 1 file changed, 196 deletions(-)
 delete mode 100644 src/gsv/internal/ast.gleam

diff --git a/src/gsv/internal/ast.gleam b/src/gsv/internal/ast.gleam
deleted file mode 100644
index 56f667f..0000000
--- a/src/gsv/internal/ast.gleam
+++ /dev/null
@@ -1,196 +0,0 @@
-//// We are using the following grammar for CSV from rfc4180
-////
-//// file = [header CRLF] record *(CRLF record) [CRLF]
-////   header = name *(COMMA name)
-////  record = field *(COMMA field)
-////  name = field
-////  field = (escaped / non-escaped)
-////  escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE
-////  non-escaped = *TEXTDATA
-
-import gleam/list
-import gleam/result
-import gleam/string
-import gsv/internal/token.{
-  type CsvToken, type Location, CR, Comma, Doublequote, LF, Location, Textdata,
-}
-
-type ParseState {
-  Beginning
-  JustParsedField
-  JustParsedComma
-  JustParsedNewline
-  JustParsedCR
-  InsideEscapedString
-}
-
-pub type ParseError {
-  ParseError(location: Location, message: String)
-}
-
-pub fn parse(
-  input: List(#(CsvToken, Location)),
-) -> Result(List(List(String)), ParseError) {
-  let inner_rev = {
-    use llf <- result.try(parse_p(input, Beginning, []))
-    use lf <- list.try_map(llf)
-    Ok(list.reverse(lf))
-  }
-  use ir <- result.try(inner_rev)
-  Ok(list.reverse(ir))
-}
-
-fn parse_p(
-  input: List(#(CsvToken, Location)),
-  parse_state: ParseState,
-  llf: List(List(String)),
-) -> Result(List(List(String)), ParseError) {
-  case input, parse_state, llf {
-    // Error Case: An empty list should produce an Error
-    [], Beginning, _ -> Error(ParseError(Location(0, 0), "Empty input"))
-
-    // BASE CASE: We are done parsing tokens
-    [], _, llf -> Ok(llf)
-
-    // File should begin with either Escaped or Nonescaped string
-    [#(Textdata(str), _), ..remaining_tokens], Beginning, [] ->
-      parse_p(remaining_tokens, JustParsedField, [[string.trim(str)]])
-
-    [#(Doublequote, _), ..remaining_tokens], Beginning, [] ->
-      parse_p(remaining_tokens, InsideEscapedString, [[""]])
-
-    [#(tok, loc), ..], Beginning, _ ->
-      Error(ParseError(
-        loc,
-        "Unexpected start to csv content: " <> token.to_lexeme(tok),
-      ))
-
-    // If we just parsed a field, we're expecting either a comma or a CRLF
-    [#(Comma, _), ..remaining_tokens], JustParsedField, llf ->
-      parse_p(remaining_tokens, JustParsedComma, llf)
-
-    [#(LF, _), ..remaining_tokens], JustParsedField, llf ->
-      parse_p(remaining_tokens, JustParsedNewline, llf)
-
-    [#(CR, _), ..remaining_tokens], JustParsedField, llf ->
-      parse_p(remaining_tokens, JustParsedCR, llf)
-
-    [#(tok, loc), ..], JustParsedField, _ ->
-      Error(ParseError(
-        loc,
-        "Expected comma or newline after field, found: " <> token.to_lexeme(tok),
-      ))
-
-    // If we just parsed a CR, we're expecting an LF
-    [#(LF, _), ..remaining_tokens], JustParsedCR, llf ->
-      parse_p(remaining_tokens, JustParsedNewline, llf)
-
-    [#(tok, loc), ..], JustParsedCR, _ ->
-      Error(ParseError(
-        loc,
-        "Expected \"\\n\" after \"\\r\", found: " <> token.to_lexeme(tok),
-      ))
-
-    // If we just parsed a comma, we're expecting an Escaped or Non-Escaped string, or another comma
-    // (indicating an empty string)
-    [#(Textdata(str), _), ..remaining_tokens],
-      JustParsedComma,
-      [curr_line, ..previously_parsed_lines]
-    ->
-      parse_p(remaining_tokens, JustParsedField, [
-        [string.trim(str), ..curr_line],
-        ..previously_parsed_lines
-      ])
-
-    [#(Doublequote, _), ..remaining_tokens],
-      JustParsedComma,
-      [curr_line, ..previously_parsed_lines]
-    ->
-      parse_p(remaining_tokens, InsideEscapedString, [
-        ["", ..curr_line],
-        ..previously_parsed_lines
-      ])
-
-    [#(Comma, _), ..remaining_tokens],
-      JustParsedComma,
-      [curr_line, ..previously_parsed_lines]
-    ->
-      parse_p(remaining_tokens, JustParsedComma, [
-        ["", ..curr_line],
-        ..previously_parsed_lines
-      ])
-
-    [#(CR, _), ..remaining_tokens],
-      JustParsedComma,
-      [curr_line, ..previously_parsed_lines]
-    ->
-      parse_p(remaining_tokens, JustParsedCR, [
-        ["", ..curr_line],
-        ..previously_parsed_lines
-      ])
-
-    [#(LF, _), ..remaining_tokens],
-      JustParsedComma,
-      [curr_line, ..previously_parsed_lines]
-    ->
-      parse_p(remaining_tokens, JustParsedNewline, [
-        ["", ..curr_line],
-        ..previously_parsed_lines
-      ])
-
-    [#(tok, loc), ..], JustParsedComma, _ ->
-      Error(ParseError(
-        loc,
-        "Expected escaped or non-escaped string after comma, found: "
-          <> token.to_lexeme(tok),
-      ))
-
-    // If we just parsed a new line, we're expecting an escaped or non-escaped string
-    [#(Textdata(str), _), ..remaining_tokens], JustParsedNewline, llf ->
-      parse_p(remaining_tokens, JustParsedField, [[string.trim(str)], ..llf])
-
-    [#(Doublequote, _), ..remaining_tokens],
-      JustParsedNewline,
-      [curr_line, ..previously_parsed_lines]
-    ->
-      parse_p(remaining_tokens, InsideEscapedString, [
-        [""],
-        curr_line,
-        ..previously_parsed_lines
-      ])
-
-    [#(tok, loc), ..], JustParsedNewline, _ ->
-      Error(ParseError(
-        loc,
-        "Expected escaped or non-escaped string after newline, found: "
-          <> token.to_lexeme(tok),
-      ))
-
-    // If we're inside an escaped string, we can take anything until we get a double quote,
-    // but a double double quote "" escapes the double quote and we keep parsing
-    [#(Doublequote, _), #(Doublequote, _), ..remaining_tokens],
-      InsideEscapedString,
-      [[str, ..rest_curr_line], ..previously_parsed_lines]
-    ->
-      parse_p(remaining_tokens, InsideEscapedString, [
-        [str <> "\"", ..rest_curr_line],
-        ..previously_parsed_lines
-      ])
-
-    [#(Doublequote, _), ..remaining_tokens], InsideEscapedString, llf ->
-      parse_p(remaining_tokens, JustParsedField, llf)
-
-    [#(other_token, _), ..remaining_tokens],
-      InsideEscapedString,
-      [[str, ..rest_curr_line], ..previously_parsed_lines]
-    ->
-      parse_p(remaining_tokens, InsideEscapedString, [
-        [str <> token.to_lexeme(other_token), ..rest_curr_line],
-        ..previously_parsed_lines
-      ])
-
-    // Anything else is an error
-    [#(tok, loc), ..], _, _ ->
-      Error(ParseError(loc, "Unexpected token: " <> token.to_lexeme(tok)))
-  }
-}

From a1b15f46be2ce3238b4c4239c06db59b63b89ccb Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 13:58:44 +0100
Subject: [PATCH 18/23] changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb0a433..d024a31 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 
 ## Unreleased
+- Improved performance of `to_lists`, `to_dicts`, `from_lists` and `from_dicts`.
+- Parsing now doesn't trim the csv fields, conforming to RFC4180.
+- The `to_lists` and `to_dicts` functions now return a structured error instead
+  of a `String`.
 
 ## v2.0.3 - 25 October 2024
 - Patch to remove some unused imports.

From 864fc8dfc62ce0bdeed5bb404abfb2e1aad8876a Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 14:22:34 +0100
Subject: [PATCH 19/23] readme

---
 README.md | 36 ++++++++++++------------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 1c8570b..7507a3f 100644
--- a/README.md
+++ b/README.md
@@ -3,42 +3,30 @@
 [![Package Version](https://img.shields.io/hexpm/v/gsv)](https://hex.pm/packages/gsv)
 [![Hex Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/gsv/)
 
-This is a simple csv parser and writer for Gleam. It will get more performant/battle tested in the future,
-but if you're looking for that now, I'd recommend doing ffi to an existing parser in your target runtime.
-
-#### Example
+A simple csv parser and serialiser for Gleam.
 
 ```gleam
 import gsv.{Unix, Windows}
 
 pub fn main() {
-  let csv_str =
-    "Hello,World
-Goodbye,Mars"
-
-  // Parse a CSV string to a List(List(String))
-  let assert Ok(records) = gsv.to_lists(csv_str)
+  let csv =
+    "name,loves
+lucy,gleam"
 
-  // Write a List(List(String)) to a CSV string
-  let csv_str = records
-  |> gsv.from_lists(separator: ",", line_ending: Windows)
+  // Parse a csv string into a list of rows.
+  let assert Ok(rows) = gsv.to_lists(csv)
+  // -> [["name", "loves"], ["lucy", "gleam"]]
 
-  // Parse a CSV string with headers to a List(Dict(String, String))
-  let assert Ok(records) = gsv.to_dicts(csv_str)
-  // => [ dict.from_list([ #("Hello", "Goodbye"), #("World", "Mars") ]) ]
-
-  // Write a List(Dict(String, String)) to a CSV string, treating the keys as the header row
-  let csv_str = records
-    |> gsv.from_dicts(separator: ",", line_ending: Windows)
+  // If your csv has headers you can also parse it into a list of dictionaries.
+  let assert Ok(rows) = gsv.to_dicts(csv)
+  // -> [dict.from_list([#("name", "lucy"), #("loves", "gleam")])]
 }
 ```
 
 ## Installation
 
-If available on Hex this package can be added to your Gleam project:
+To add this package to your Gleam project:
 
 ```sh
-gleam add gsv
+gleam add gsv@3
 ```
-
-and its documentation can be found at <https://hexdocs.pm/gsv>.

From 7d607f449f79436afd48683fc8fc7116a02e5736 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 15:05:20 +0100
Subject: [PATCH 20/23] improve from_lists

---
 src/gsv.gleam | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/gsv.gleam b/src/gsv.gleam
index d2611f7..dbecb82 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -392,15 +392,17 @@ pub fn from_lists(
   line_ending line_ending: LineEnding,
 ) -> String {
   let line_ending = line_ending_to_string(line_ending)
-  do_from_lists(rows, separator, line_ending, "")
+  do_from_lists(rows, separator, line_ending, [])
+  |> list.reverse
+  |> string.join(with: "")
 }
 
 fn do_from_lists(
   rows: List(List(String)),
   separator: String,
   line_ending: String,
-  acc: String,
-) -> String {
+  acc: List(String),
+) -> List(String) {
   case rows {
     [] -> acc
     // When we're down to the last row, we don't add a final newline at the end
@@ -420,17 +422,19 @@ fn row_to_string(
   row: List(String),
   separator: String,
   line_ending: String,
-  acc: String,
-) -> String {
+  acc: List(String),
+) -> List(String) {
   case row {
     [] -> acc
+
     // When we're down to the last field of the row we need to add the line
     // ending instead of the field separator. So we special handle this case.
-    [last_field] -> acc <> escape_field(last_field, separator) <> line_ending
+    [last_field] -> [line_ending, escape_field(last_field, separator), ..acc]
+
     // For all other cases we add the field to the accumulator and append a
     // separator to separate it from the next field in the row.
     [field, ..rest] -> {
-      let acc = acc <> escape_field(field, separator) <> separator
+      let acc = [separator, escape_field(field, separator), ..acc]
       row_to_string(rest, separator, line_ending, acc)
     }
   }
@@ -454,10 +458,13 @@ fn escape_field(field: String, separator: String) -> String {
 }
 
 fn escaping(string: String, separator: String) -> Escaping {
-  do_escaping(string, separator, NoEscaping)
+  case string.contains(string, separator) {
+    True -> do_escaping(string, WrapInDoubleQuotes)
+    False -> do_escaping(string, NoEscaping)
+  }
 }
 
-fn do_escaping(string: String, separator: String, kind: Escaping) {
+fn do_escaping(string: String, kind: Escaping) {
   case string {
     // As soon as we find a double quote we know that we must escape the double
     // quotes and wrap it in double quotes, no need to keep going through the
@@ -465,19 +472,11 @@ fn do_escaping(string: String, separator: String, kind: Escaping) {
     "\"" <> _ -> WrapInDoubleQuotesAndEscapeDoubleQuotes
     // If we find a newline we know the string must at least be wrapped in
     // double quotes but we keep going in case we find a `"`.
-    "\n" <> rest -> do_escaping(rest, separator, WrapInDoubleQuotes)
+    "\n" <> rest -> do_escaping(rest, WrapInDoubleQuotes)
     // If we reach the end of the string we return the accumulator.
     "" -> kind
-    // In all other cases we check if the string starts with the separator, in
-    // that case we know it must be at least wrapped in double quotes.
-    // But we keep going in case we find a `"`.
-    _ -> {
-      let assert Ok(#(_, rest)) = string.pop_grapheme(string)
-      case kind == WrapInDoubleQuotes || string.starts_with(string, separator) {
-        True -> do_escaping(rest, separator, WrapInDoubleQuotes)
-        False -> do_escaping(rest, separator, kind)
-      }
-    }
+    // In all other cases we keep looking.
+    _ -> do_escaping(drop_bytes(string, 1), kind)
   }
 }
 

From 852d4226efa9dc34e1d0b530f9f04330bbf86dad Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 15:21:22 +0100
Subject: [PATCH 21/23] improve from_lists

---
 src/gsv.gleam | 93 +++++++--------------------------------------------
 1 file changed, 12 insertions(+), 81 deletions(-)

diff --git a/src/gsv.gleam b/src/gsv.gleam
index dbecb82..9bf2068 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -392,91 +392,22 @@ pub fn from_lists(
   line_ending line_ending: LineEnding,
 ) -> String {
   let line_ending = line_ending_to_string(line_ending)
-  do_from_lists(rows, separator, line_ending, [])
-  |> list.reverse
-  |> string.join(with: "")
-}
-
-fn do_from_lists(
-  rows: List(List(String)),
-  separator: String,
-  line_ending: String,
-  acc: List(String),
-) -> List(String) {
-  case rows {
-    [] -> acc
-    // When we're down to the last row, we don't add a final newline at the end
-    // of the string. So we special handle this case and pass in an empty string
-    // as the `line_ending` to add to the row.
-    [last_row] -> row_to_string(last_row, separator, "", acc)
-    // For all other cases we just accumulate the line string onto the string
-    // accumulator.
-    [row, ..rest] -> {
-      let acc = row_to_string(row, separator, line_ending, acc)
-      do_from_lists(rest, separator, line_ending, acc)
-    }
-  }
-}
-
-fn row_to_string(
-  row: List(String),
-  separator: String,
-  line_ending: String,
-  acc: List(String),
-) -> List(String) {
-  case row {
-    [] -> acc
-
-    // When we're down to the last field of the row we need to add the line
-    // ending instead of the field separator. So we special handle this case.
-    [last_field] -> [line_ending, escape_field(last_field, separator), ..acc]
-
-    // For all other cases we add the field to the accumulator and append a
-    // separator to separate it from the next field in the row.
-    [field, ..rest] -> {
-      let acc = [separator, escape_field(field, separator), ..acc]
-      row_to_string(rest, separator, line_ending, acc)
-    }
-  }
-}
 
-/// The kind of escaping needed by a csv field.
-///
-type Escaping {
-  NoEscaping
-  WrapInDoubleQuotes
-  WrapInDoubleQuotesAndEscapeDoubleQuotes
+  list.map(rows, fn(row) {
+    list.map(row, escape_field(_, separator))
+    |> string.join(with: separator)
+  })
+  |> string.join(with: line_ending)
 }
 
 fn escape_field(field: String, separator: String) -> String {
-  case escaping(field, separator) {
-    NoEscaping -> field
-    WrapInDoubleQuotes -> "\"" <> field <> "\""
-    WrapInDoubleQuotesAndEscapeDoubleQuotes ->
-      "\"" <> string.replace(in: field, each: "\"", with: "\"\"") <> "\""
-  }
-}
-
-fn escaping(string: String, separator: String) -> Escaping {
-  case string.contains(string, separator) {
-    True -> do_escaping(string, WrapInDoubleQuotes)
-    False -> do_escaping(string, NoEscaping)
-  }
-}
-
-fn do_escaping(string: String, kind: Escaping) {
-  case string {
-    // As soon as we find a double quote we know that we must escape the double
-    // quotes and wrap it in double quotes, no need to keep going through the
-    // string.
-    "\"" <> _ -> WrapInDoubleQuotesAndEscapeDoubleQuotes
-    // If we find a newline we know the string must at least be wrapped in
-    // double quotes but we keep going in case we find a `"`.
-    "\n" <> rest -> do_escaping(rest, WrapInDoubleQuotes)
-    // If we reach the end of the string we return the accumulator.
-    "" -> kind
-    // In all other cases we keep looking.
-    _ -> do_escaping(drop_bytes(string, 1), kind)
+  case string.contains(field, "\"") {
+    True -> "\"" <> string.replace(in: field, each: "\"", with: "\"\"") <> "\""
+    False ->
+      case string.contains(field, separator) || string.contains(field, "\n") {
+        True -> "\"" <> field <> "\""
+        False -> field
+      }
   }
 }
 

From 28c4fa208d9f01e8781f2f43329cfc2805d72f84 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 15:25:24 +0100
Subject: [PATCH 22/23] document other differences

---
 src/gsv.gleam | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gsv.gleam b/src/gsv.gleam
index 9bf2068..215f9a5 100644
--- a/src/gsv.gleam
+++ b/src/gsv.gleam
@@ -73,6 +73,8 @@ fn line_ending_to_string(le: LineEnding) -> String {
 /// > - both `\n` and `\r\n` line endings are accepted.
 /// > - a line can start with an empty field `,two,three`.
 /// > - empty lines are allowed and just ignored.
+/// > - lines are not forced to all have the same number of fields.
+/// > - a line can end with a comma (meaning its last field is empty).
 ///
 pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) {
   case input {

From dd9cfbb5dbf127ce5edd0c8870db6f35642631c9 Mon Sep 17 00:00:00 2001
From: Giacomo Cavalieri <giacomo.cavalieri@icloud.com>
Date: Sun, 27 Oct 2024 15:27:52 +0100
Subject: [PATCH 23/23] add test for line with different number of fields

---
 test/gsv_test.gleam | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam
index ea69529..18f463e 100644
--- a/test/gsv_test.gleam
+++ b/test/gsv_test.gleam
@@ -130,6 +130,14 @@ pub fn escaped_field_with_escaped_double_quotes_test() {
   |> should.equal([["escaped double quote -> \""]])
 }
 
+pub fn rows_with_different_number_of_fields_test() {
+  "three,fields,woo
+only,two"
+  |> gsv.to_lists
+  |> should.be_ok
+  |> should.equal([["three", "fields", "woo"], ["only", "two"]])
+}
+
 // --- DICT PARSING ------------------------------------------------------------
 
 pub fn headers_test() {