From b87cde6bfc67fcacd6dbc39e984146f71911e729 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 14:11:26 +0200 Subject: [PATCH 01/23] wip --- gleam.toml | 2 +- manifest.toml | 6 +- src/gsv.gleam | 14 +- src/gsv/internal/parse.gleam | 261 +++++++++++++++++++++++++++++++++++ src/gsv_ffi.erl | 11 ++ src/gsv_ffi.mjs | 7 + 6 files changed, 290 insertions(+), 11 deletions(-) create mode 100644 src/gsv/internal/parse.gleam create mode 100644 src/gsv_ffi.erl create mode 100644 src/gsv_ffi.mjs diff --git a/gleam.toml b/gleam.toml index 0161427..5080dd0 100644 --- a/gleam.toml +++ b/gleam.toml @@ -12,7 +12,7 @@ internal_modules = [ ] [dependencies] -gleam_stdlib = "~> 0.34 or ~> 1.0" +gleam_stdlib = ">= 0.40.0 and < 1.0.0" [dev-dependencies] gleeunit = "~> 1.0" diff --git a/manifest.toml b/manifest.toml index 7762492..5cfd9ac 100644 --- a/manifest.toml +++ b/manifest.toml @@ -2,10 +2,10 @@ # You typically do not need to edit this file packages = [ - { name = "gleam_stdlib", version = "0.34.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "1FB8454D2991E9B4C0C804544D8A9AD0F6184725E20D63C3155F0AEB4230B016" }, - { name = "gleeunit", version = "1.0.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "D364C87AFEB26BDB4FB8A5ABDE67D635DC9FA52D6AB68416044C35B096C6882D" }, + { name = "gleam_stdlib", version = "0.40.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "86606B75A600BBD05E539EB59FABC6E307EEEA7B1E5865AFB6D980A93BCB2181" }, + { name = "gleeunit", version = "1.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "F7A7228925D3EE7D0813C922E062BFD6D7E9310F0BEE585D3A42F3307E3CFD13" }, ] [requirements] -gleam_stdlib = { version = "~> 0.34 or ~> 1.0" } +gleam_stdlib = { version = ">= 0.40.0 and < 1.0.0" } gleeunit = { 
version = "~> 1.0" } diff --git a/src/gsv.gleam b/src/gsv.gleam index 25b031a..b3dafa1 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -9,7 +9,7 @@ import gsv/internal/token.{Location} /// Parses a csv string to a list of lists of strings. /// Automatically handles Windows and Unix line endings. /// Returns a string error msg if the string is not valid csv. -/// Unquoted strings are trimmed, while quoted strings have leading and trailing +/// Unquoted strings are trimmed, while quoted strings have leading and trailing /// whitespace preserved. pub fn to_lists(input: String) -> Result(List(List(String)), String) { input @@ -28,14 +28,14 @@ pub fn to_lists(input: String) -> Result(List(List(String)), String) { }) } -/// Parses a csv string to a list of dicts. +/// Parses a csv string to a list of dicts. /// Automatically handles Windows and Unix line endings. /// Returns a string error msg if the string is not valid csv. -/// Unquoted strings are trimmed, while quoted strings have leading and trailing +/// Unquoted strings are trimmed, while quoted strings have leading and trailing /// whitespace preserved. -/// Whitespace only or empty strings are not valid headers and will be ignored. -/// Whitespace only or empty strings are not considered "present" in the csv row and -/// are not inserted into the row dict. +/// Whitespace only or empty strings are not valid headers and will be ignored. +/// Whitespace only or empty strings are not considered "present" in the csv row and +/// are not inserted into the row dict. pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), String) { use lol <- result.try(to_lists(input)) case lol { @@ -66,7 +66,7 @@ pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), String) { } /// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows" -/// line endings. Use with the `from_lists` function when +/// line endings. Use with the `from_lists` function when /// writing to a csv string. 
pub type LineEnding { Windows diff --git a/src/gsv/internal/parse.gleam b/src/gsv/internal/parse.gleam new file mode 100644 index 0000000..e3e462e --- /dev/null +++ b/src/gsv/internal/parse.gleam @@ -0,0 +1,261 @@ +import gleam/list +import gleam/string + +pub fn parse(string) -> Result(List(List(String)), ParseError) { + case string { + // We just ignore all unescaped newlines at the beginning of a file. + "\n" <> rest | "\r\n" <> rest -> parse(rest) + // If it starts with a `"` then we know it starts with an escaped field. + "\"" <> rest -> do_parse(rest, string, 1, 0, [], [], ParsingEscapedField) + // If it starts with a `,` then it starts with an empty field we're filling + // out manually. + "," <> rest -> do_parse(rest, string, 1, 0, [""], [], CommaFound) + // Otherwise we just start parsing the first unescaped field. + _ -> do_parse(string, string, 0, 0, [], [], ParsingUnescapedField) + } +} + +pub type ParseError { + /// A field can contain a double quote only if it is escaped (that is, + /// surrounded by double quotes). For example `wobb"le` would be an invalid + /// field, the correct way to write such a field would be like this: + /// `"wobb""le"`. + /// + UnescapedQuote( + /// The byte index of the unescaped double quote. + position: Int, + ) + + /// This error can occur when the file ends without the closing `"` of an + /// escaped field. For example: `"hello`. + /// + UnclosedEscapedField( + /// The byte index of the start of the unclosed escaped field. + start: Int, + ) +} + +type ParseStatus { + ParsingEscapedField + ParsingUnescapedField + CommaFound + NewlineFound +} + +/// ## What does this scary looking function do? +/// +/// At a high level, it goes over the csv `string` byte-by-byte and parses rows +/// accumulating those into `rows` as it goes. +/// +/// +/// ## Why does it have all these parameters? What does each one do? 
+/// +/// In order to be extra efficient this function parses the csv file in a single +/// pass and uses string slicing to avoid copying data. +/// Each time we see a new field we keep track of the byte where it starts with +/// `field_start` and then count the bytes (that's the `field_length` variable) +/// until we find its end (either a newline, the end of the file, or a `,`). +/// +/// After reaching the end of a field we extract it from the original string +/// taking a slice that goes from `field_start` and has `field_length` bytes. +/// This is where the magic happens: slicing a string this way is a constant +/// time operation and doesn't copy the string so it's crazy fast! +/// +/// `row` is an accumulator with all the fields of the current row as +/// they are parsed. Once we run into a newline `row` is added to all +/// the other `rows`. +/// +/// We also keep track of _what_ we're parsing with the `status` to make +/// sure that we're correctly dealing with escaped fields and double quotes. +/// +fn do_parse( + string: String, + original: String, + field_start: Int, + field_length: Int, + row: List(String), + rows: List(List(String)), + status: ParseStatus, +) -> Result(List(List(String)), ParseError) { + case string, status { + // If we find a comma we're done with the current field and can take a slice + // going from `field_start` with `field_length` bytes: + // + // wibble,wobble,... + // ╰────╯ field_length = 6 + // ┬ + // ╰ field_start + // + // After taking the slice we move the slice start _after_ the comma: + // + // wibble,wobble,... 
+ // ┬ + // ╰ field_start = field_start + field_length + 1 (the comma) + // + "," <> rest, CommaFound + | "," <> rest, NewlineFound + | "," <> rest, ParsingUnescapedField + -> { + let field = extract_field(original, field_start, field_length, status) + let row = [field, ..row] + let field_start = field_start + field_length + 1 + do_parse(rest, original, field_start, 0, row, rows, CommaFound) + } + "\"," <> rest, ParsingEscapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = [field, ..row] + let field_start = field_start + field_length + 2 + do_parse(rest, original, field_start, 0, row, rows, CommaFound) + } + + // When the string is over we're done parsing. + // We take the final field we were in the middle of parsing and add it to + // the current row that is returned together with all the parsed rows. + // + "", ParsingUnescapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = list.reverse([field, ..row]) + Ok(list.reverse([row, ..rows])) + } + + "", CommaFound -> { + let row = list.reverse(["", ..row]) + Ok(list.reverse([row, ..rows])) + } + + "", NewlineFound -> Ok(list.reverse(rows)) + + // If the string is over and we were parsing an escaped field, that's an + // error. We would expect to find a closing double quote before the end of + // the data. + // + "", ParsingEscapedField -> Error(UnclosedEscapedField(field_start)) + + // When we run into a new line (CRLF or just LF) we know we're done with the + // current field and take a slice of it, just like we did in the previous + // branch! + // The only difference is we also add the current `row` to all the other + // ones and start with a new one. + // + // > ⚠️ As for RFC 4180 lines should only be delimited by a CRLF. + // > Here we do something slightly different and also accept lines that are + // > delimited by just LF too. 
+ // + // The next three branches are the same except for the new `field_start` + // that has to take into account the different lengths. + // I tried writing it as `"\n" as sep | "\r\n" as sep | ...` and then + // adding the length of that but it had a noticeable (albeit small) impact + // on performance. + // + "\n" <> rest, ParsingUnescapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = list.reverse([field, ..row]) + let rows = [row, ..rows] + let field_start = field_start + field_length + 1 + do_parse(rest, original, field_start, 0, [], rows, NewlineFound) + } + "\r\n" <> rest, ParsingUnescapedField | "\"\n" <> rest, ParsingEscapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = list.reverse([field, ..row]) + let rows = [row, ..rows] + let field_start = field_start + field_length + 2 + do_parse(rest, original, field_start, 0, [], rows, NewlineFound) + } + "\"\r\n" <> rest, ParsingEscapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = list.reverse([field, ..row]) + let rows = [row, ..rows] + let field_start = field_start + field_length + 3 + do_parse(rest, original, field_start, 0, [], rows, NewlineFound) + } + + // If the newline is immediately after a comma then the row ends with an + // empty field. + // + "\n" <> rest, CommaFound -> { + let row = list.reverse(["", ..row]) + let rows = [row, ..rows] + do_parse(rest, original, field_start + 1, 0, [], rows, NewlineFound) + } + "\r\n" <> rest, CommaFound -> { + let row = list.reverse(["", ..row]) + let rows = [row, ..rows] + do_parse(rest, original, field_start + 2, 0, [], rows, NewlineFound) + } + + // If the newline immediately comes after a newline that means we've run + // into an empty line that we can just safely ignore. 
+ // + "\n" <> rest, NewlineFound -> + do_parse(rest, original, field_start + 1, 0, row, rows, status) + "\r\n" <> rest, NewlineFound -> + do_parse(rest, original, field_start + 2, 0, row, rows, status) + + // An escaped quote found while parsing an escaped field. + // + "\"\"" <> rest, ParsingEscapedField -> + do_parse(rest, original, field_start, field_length + 2, row, rows, status) + + // An unescaped quote found while parsing a field. + // + "\"" <> _, ParsingUnescapedField | "\"" <> _, ParsingEscapedField -> + Error(UnescapedQuote(position: field_start + field_length)) + + // If the quote is found immediately after a comma or a newline that signals + // the start of a new escaped field to parse. + // + "\"" <> rest, CommaFound | "\"" <> rest, NewlineFound -> { + let status = ParsingEscapedField + do_parse(rest, original, field_start + 1, 0, row, rows, status) + } + + // In all other cases we're still parsing a field so we just drop a byte + // from the string we're iterating through, increase the size of the slice + // we need to take and keep going. + // + // > ⚠️ Notice how we're not trying to trim any whitespace at the + // > beginning or end of a field: RFC 4180 states that "Spaces are + // > considered part of a field and should not be ignored." 
+ // + _, CommaFound + | _, NewlineFound + | _, ParsingUnescapedField + | _, ParsingEscapedField + -> { + let status = case status { + ParsingEscapedField -> ParsingEscapedField + CommaFound | NewlineFound | ParsingUnescapedField -> + ParsingUnescapedField + } + let rest = drop_bytes(string, 1) + do_parse(rest, original, field_start, field_length + 1, row, rows, status) + } + } +} + +fn extract_field( + string: String, + from: Int, + length: Int, + status: ParseStatus, +) -> String { + let field = slice_bytes(string, from, length) + case status { + CommaFound | ParsingUnescapedField | NewlineFound -> field + ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"") + } +} + +/// In general this wouldn't be safe, by just slicing random bytes in the middle +/// of a utf8 string we might end up with something that is not a valid utf8 +/// string. +/// However, the parser only slices fields in between commas so it should always +/// yield valid utf8 slices. +/// +@external(erlang, "gsv_ffi", "slice") +@external(javascript, "../../gsv_ffi.mjs", "slice") +fn slice_bytes(string: String, from: Int, length: Int) -> String + +@external(erlang, "gsv_ffi", "drop_bytes") +@external(javascript, "../../gsv_ffi.mjs", "drop_bytes") +fn drop_bytes(string: String, bytes: Int) -> String diff --git a/src/gsv_ffi.erl b/src/gsv_ffi.erl new file mode 100644 index 0000000..acec49e --- /dev/null +++ b/src/gsv_ffi.erl @@ -0,0 +1,11 @@ +-module(gsv_ffi). +-export([slice/3, drop_bytes/2]). + +slice(String, Index, Length) -> + binary:part(String, Index, Length). + +drop_bytes(String, Bytes) -> + case String of + <<_:Bytes/bytes, Rest/binary>> -> Rest; + <<>> -> <<>> + end. 
diff --git a/src/gsv_ffi.mjs b/src/gsv_ffi.mjs new file mode 100644 index 0000000..468b4dc --- /dev/null +++ b/src/gsv_ffi.mjs @@ -0,0 +1,7 @@ +export function slice(string, start, size) { + return string.slice(start, start + size); +} + +export function drop_bytes(string, bytes) { + return string.slice(bytes); +} From 71c56ccafade8538d7503e95cae5086d446cf1dd Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 14:48:52 +0200 Subject: [PATCH 02/23] Remove deprecated code --- test/gsv_test.gleam | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam index 4a14ebc..41291dc 100644 --- a/test/gsv_test.gleam +++ b/test/gsv_test.gleam @@ -1,8 +1,5 @@ import gleam/dict -import gleam/int -import gleam/list import gleam/result -import gleam/string import gleeunit import gleeunit/should import gsv.{Unix, Windows} @@ -78,30 +75,6 @@ pub fn last_line_has_optional_line_ending_test() { |> should.equal(Ok([["test"], ["test"], ["test"]])) } -// ---------- Example doing CSV string -> Custom type ------------------------ -pub type User { - User(name: String, age: Int) -} - -fn from_list(record: List(String)) -> Result(User, Nil) { - use name <- result.try(list.at(record, 0)) - use age_str <- result.try(list.at(record, 1)) - use age <- result.try(int.parse(string.trim(age_str))) - Ok(User(name, age)) -} - -pub fn decode_to_type_test() { - let assert Ok(lls) = - "Ben, 25\nAustin, 21" - |> gsv.to_lists - let users = - list.fold(lls, [], fn(acc, record) { [from_list(record), ..acc] }) - |> list.reverse - - users - |> should.equal([Ok(User("Ben", 25)), Ok(User("Austin", 21))]) -} - // --------------------------------------------------------------------------- pub fn encode_test() { From 73d451d7b541aed88a268ac7735e087ac158dbf0 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 14:51:01 +0200 Subject: [PATCH 03/23] stop testing internals of the library --- test/gsv_test.gleam | 69 
--------------------------------------------- 1 file changed, 69 deletions(-) diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam index 41291dc..904acab 100644 --- a/test/gsv_test.gleam +++ b/test/gsv_test.gleam @@ -1,62 +1,18 @@ import gleam/dict -import gleam/result import gleeunit import gleeunit/should import gsv.{Unix, Windows} -import gsv/internal/ast.{ParseError, parse} -import gsv/internal/token.{ - CR, Comma, Doublequote, LF, Location, Textdata, scan, with_location, -} pub fn main() { gleeunit.main() } -pub fn scan_test() { - "Ben, 25,\" TRUE\r\n\"" - |> scan - |> should.equal([ - Textdata("Ben"), - Comma, - Textdata(" 25"), - Comma, - Doublequote, - Textdata(" TRUE"), - CR, - LF, - Doublequote, - ]) -} - -pub fn parse_test() { - "Ben, 25,\" TRUE\n\r\"\"\"\nAustin, 25, FALSE" - |> scan - |> with_location - |> parse - |> should.equal(Ok([["Ben", "25", " TRUE\n\r\""], ["Austin", "25", "FALSE"]])) -} - -pub fn parse_empty_string_fail_test() { - "" - |> scan - |> with_location - |> parse - |> result.nil_error - |> should.equal(Error(Nil)) -} - pub fn csv_parse_test() { "Ben, 25,\" TRUE\n\r\"\"\"\nAustin, 25, FALSE" |> gsv.to_lists |> should.equal(Ok([["Ben", "25", " TRUE\n\r\""], ["Austin", "25", "FALSE"]])) } -pub fn scan_crlf_test() { - "\r\n" - |> scan - |> should.equal([CR, LF]) -} - pub fn parse_crlf_test() { "test\ntest\r\ntest" |> gsv.to_lists @@ -116,31 +72,6 @@ pub fn for_the_readme_test() { |> should.equal("Hello,World\r\nGoodbye,Mars") } -pub fn error_cases_test() { - let produce_error = fn(csv_str) { - case - csv_str - |> scan - |> with_location - |> parse - { - Ok(_) -> panic as "Expected an error" - Error(ParseError(loc, msg)) -> #(loc, msg) - } - } - - produce_error("Ben, 25,\n, TRUE") - |> should.equal(#( - Location(2, 1), - "Expected escaped or non-escaped string after newline, found: ,", - )) - produce_error("Austin, 25, FALSE\n\"Ben Peinhardt\", 25,\n, TRUE") - |> should.equal(#( - Location(3, 1), - "Expected escaped or non-escaped string 
after newline, found: ,", - )) -} - // pub fn totally_panics_test() { // "Ben, 25,, TRUE" |> gsv.to_lists_or_panic // } From f7ddd4e67e63678df7574c34a44d224178cfbb4d Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 14:51:17 +0200 Subject: [PATCH 04/23] remove commented unused test --- test/gsv_test.gleam | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam index 904acab..a3a4afc 100644 --- a/test/gsv_test.gleam +++ b/test/gsv_test.gleam @@ -72,10 +72,6 @@ pub fn for_the_readme_test() { |> should.equal("Hello,World\r\nGoodbye,Mars") } -// pub fn totally_panics_test() { -// "Ben, 25,, TRUE" |> gsv.to_lists_or_panic -// } - pub fn totally_doesnt_error_test() { "Ben, 25,, TRUE" |> gsv.to_lists From 8c7a66870e566076ac708849189d37f3fbaae6fb Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 15:46:12 +0200 Subject: [PATCH 05/23] fix small bug --- src/gsv/internal/parse.gleam | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsv/internal/parse.gleam b/src/gsv/internal/parse.gleam index e3e462e..2c54d3d 100644 --- a/src/gsv/internal/parse.gleam +++ b/src/gsv/internal/parse.gleam @@ -112,7 +112,7 @@ fn do_parse( // We take the final field we were in the middle of parsing and add it to // the current row that is returned together with all the parsed rows. 
// - "", ParsingUnescapedField -> { + "", ParsingUnescapedField | "\"", ParsingEscapedField -> { let field = extract_field(original, field_start, field_length, status) let row = list.reverse([field, ..row]) Ok(list.reverse([row, ..rows])) From 2398cd779ca7c19f5194a3d2264cd5a52deb7cf3 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 15:46:39 +0200 Subject: [PATCH 06/23] use explicit newline in readme example --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a30dbfc..1c8570b 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,9 @@ but if you're looking for that now, I'd recommend doing ffi to an existing parse import gsv.{Unix, Windows} pub fn main() { - let csv_str = "Hello, World\nGoodbye, Mars" + let csv_str = + "Hello,World +Goodbye,Mars" // Parse a CSV string to a List(List(String)) let assert Ok(records) = gsv.to_lists(csv_str) From 0a0c592906492fd9a7e4aec05202f9b11570dcf9 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 15:47:14 +0200 Subject: [PATCH 07/23] use new implementation for to_lists --- src/gsv.gleam | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/gsv.gleam b/src/gsv.gleam index b3dafa1..b467ded 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -1,10 +1,9 @@ import gleam/dict.{type Dict} -import gleam/int +import gleam/io import gleam/list import gleam/result import gleam/string -import gsv/internal/ast.{ParseError} -import gsv/internal/token.{Location} +import gsv/internal/parse /// Parses a csv string to a list of lists of strings. /// Automatically handles Windows and Unix line endings. @@ -12,19 +11,10 @@ import gsv/internal/token.{Location} /// Unquoted strings are trimmed, while quoted strings have leading and trailing /// whitespace preserved. 
pub fn to_lists(input: String) -> Result(List(List(String)), String) { - input - |> token.scan - |> token.with_location - |> ast.parse - |> result.map_error(fn(e) { - let ParseError(Location(line, column), msg) = e - "[" - <> "line " - <> int.to_string(line) - <> " column " - <> int.to_string(column) - <> "] of csv: " - <> msg + parse.parse(input) + |> result.map_error(fn(error) { + io.debug(error) + todo as "decide what to do with errors" }) } From beb1617a3036288cb6b56dd4f12a74f1c6c55d2d Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 15:47:25 +0200 Subject: [PATCH 08/23] bunch of tests --- test/gsv_test.gleam | 216 +++++++++++++++++++++++++++++++++----------- 1 file changed, 162 insertions(+), 54 deletions(-) diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam index a3a4afc..ab40085 100644 --- a/test/gsv_test.gleam +++ b/test/gsv_test.gleam @@ -1,89 +1,135 @@ import gleam/dict +import gleam/string import gleeunit import gleeunit/should -import gsv.{Unix, Windows} +import gsv.{type LineEnding, Unix, Windows} pub fn main() { gleeunit.main() } +// --- LISTS PARSING ----------------------------------------------------------- + pub fn csv_parse_test() { - "Ben, 25,\" TRUE\n\r\"\"\"\nAustin, 25, FALSE" + "Ben,25,true +Austin,25,false" |> gsv.to_lists - |> should.equal(Ok([["Ben", "25", " TRUE\n\r\""], ["Austin", "25", "FALSE"]])) + |> should.be_ok + |> should.equal([["Ben", "25", "true"], ["Austin", "25", "false"]]) } -pub fn parse_crlf_test() { - "test\ntest\r\ntest" +pub fn csv_with_crlf_test() { + "Ben,25,true\r +Austin,25,false" |> gsv.to_lists - |> should.equal(Ok([["test"], ["test"], ["test"]])) + |> should.be_ok + |> should.equal([["Ben", "25", "true"], ["Austin", "25", "false"]]) } -pub fn parse_lfcr_fails_test() { - "test\n\r" +pub fn csv_with_mixed_newline_kinds_test() { + "one +two\r +three" |> gsv.to_lists - |> should.be_error + |> should.equal(Ok([["one"], ["two"], ["three"]])) } -pub fn last_line_has_optional_line_ending_test() 
{ - "test\ntest\r\ntest\n" +pub fn whitespace_is_not_trimmed_from_fields_test() { + "Ben , 25 , true +Austin , 25 , false" |> gsv.to_lists - |> should.equal(Ok([["test"], ["test"], ["test"]])) + |> should.be_ok + |> should.equal([["Ben ", " 25 ", " true"], ["Austin ", " 25 ", " false"]]) } -// --------------------------------------------------------------------------- +pub fn empty_lines_are_ignored_test() { + " +one -pub fn encode_test() { - let assert Ok(lls) = gsv.to_lists("Ben, 25\nAustin, 21") - lls - |> gsv.from_lists(separator: ",", line_ending: Unix) - |> should.equal("Ben,25\nAustin,21") +two\r +\r +three" + |> gsv.to_lists + |> should.be_ok + |> should.equal([["one"], ["two"], ["three"]]) } -pub fn encode_with_escaped_string_test() { - let assert Ok(lls) = - "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE" - |> gsv.to_lists +pub fn last_line_can_end_with_newline_test() { + "one\ntwo\n" + |> gsv.to_lists + |> should.be_ok + |> should.equal([["one"], ["two"]]) +} - lls - |> gsv.from_lists(separator: ",", line_ending: Unix) - |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\nAustin,25,FALSE") +pub fn empty_fields_test() { + "one,,three" + |> gsv.to_lists + |> should.be_ok + |> should.equal([["one", "", "three"]]) } -pub fn encode_with_escaped_string_windows_test() { - let assert Ok(lls) = - "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE" - |> gsv.to_lists +pub fn csv_ending_with_an_empty_field_test() { + "one,two," + |> gsv.to_lists + |> should.be_ok + |> should.equal([["one", "two", ""]]) +} - lls - |> gsv.from_lists(separator: ",", line_ending: Windows) - |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\r\nAustin,25,FALSE") +pub fn csv_starting_with_an_empty_field_test() { + ",two,three" + |> gsv.to_lists + |> should.be_ok + |> should.equal([["", "two", "three"]]) } -pub fn for_the_readme_test() { - let csv_str = "Hello, World\nGoodbye, Mars" +pub fn escaped_field_test() { + "'gleam','functional' +'erlang','functional'" + // Writing and escaping the double quotes by 
hand is a bit noisy and makes it + // hard to read the literal string so I prefer to write single quotes + // and replace those before parsing :P + |> string.replace(each: "'", with: "\"") + |> gsv.to_lists + |> should.be_ok + |> should.equal([["gleam", "functional"], ["erlang", "functional"]]) +} - // Parse a CSV string to a List(List(String)) - let assert Ok(records) = gsv.to_lists(csv_str) +pub fn escaped_field_with_newlines_test() { + "'wibble +wobble','wibble'" + |> string.replace(each: "'", with: "\"") + |> gsv.to_lists + |> should.be_ok + |> should.equal([["wibble\nwobble", "wibble"]]) +} - // Write a List(List(String)) to a CSV string - records - |> gsv.from_lists(separator: ",", line_ending: Windows) - |> should.equal("Hello,World\r\nGoodbye,Mars") +pub fn escaped_field_with_crlf_test() { + "'wibble\r +wobble','wibble'" + |> string.replace(each: "'", with: "\"") + |> gsv.to_lists + |> should.be_ok + |> should.equal([["wibble\r\nwobble", "wibble"]]) } -pub fn totally_doesnt_error_test() { - "Ben, 25,, TRUE" +pub fn escaped_field_with_comma_test() { + "'wibble,wobble','wibble'" + |> string.replace(each: "'", with: "\"") |> gsv.to_lists - |> should.equal(Ok([["Ben", "25", "", "TRUE"]])) + |> should.be_ok + |> should.equal([["wibble,wobble", "wibble"]]) } -pub fn trailing_commas_fine_test() { - "Ben, 25, TRUE, Hello\nAustin, 25,\n" +pub fn escaped_field_with_escaped_double_quotes_test() { + "'escaped double quote -> '''" + |> string.replace(each: "'", with: "\"") |> gsv.to_lists - |> should.equal(Ok([["Ben", "25", "TRUE", "Hello"], ["Austin", "25", ""]])) + |> should.be_ok + |> should.equal([["escaped double quote -> \""]]) } +// --- DICT PARSING ------------------------------------------------------------ + pub fn headers_test() { "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n" |> gsv.to_dicts @@ -94,14 +140,6 @@ pub fn headers_test() { ]) } -pub fn dicts_round_trip_test() { - "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n" - |> gsv.to_dicts - |> 
should.be_ok - |> gsv.from_dicts(",", Unix) - |> should.equal("age,name\n27,Ben\n27,Austin") -} - pub fn dicts_with_empty_str_header_test() { "name,\" \", ,,age\nBen,foo,bar,baz,27,extra_data" |> gsv.to_dicts @@ -146,3 +184,73 @@ pub fn quotes_test() { ["11/11/2024", "Apples", "7", "5"], ]) } + +// --- TESTING ERRORS ---------------------------------------------------------- + +pub fn double_quote_in_middle_of_field_test() { + "field,other\"field" + |> gsv.to_lists + |> should.be_error + |> should.equal(todo) +} + +pub fn unescaped_double_quote_in_escaped_field_test() { + "'unescaped double quote -> ' in escaped field'" + |> string.replace(each: "'", with: "\"") + |> gsv.to_lists + |> should.be_error + |> should.equal(todo) +} + +pub fn unescaped_carriage_return_test() { + todo as "decide what to do" + "test\n\r" + |> gsv.to_lists + |> should.be_error +} + +// --- ENCODING TESTS ---------------------------------------------------------- + +pub fn encode_test() { + "Ben, 25 +Austin, 21" + |> test_lists_roundtrip(",", Unix) +} + +pub fn encode_with_escaped_string_test() { + "Ben, 25,' TRUE +\r'' ' +Austin, 25, FALSE" + |> string.replace(each: "'", with: "\"") + |> test_lists_roundtrip(",", Unix) +} + +pub fn encode_with_escaped_string_windows_test() { + let assert Ok(lls) = + "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE" + |> gsv.to_lists + + lls + |> gsv.from_lists(separator: ",", line_ending: Windows) + |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\r\nAustin,25,FALSE") +} + +pub fn dicts_round_trip_test() { + "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n" + |> gsv.to_dicts + |> should.be_ok + |> gsv.from_dicts(",", Unix) + |> should.equal("age,name\n27,Ben\n27,Austin") +} + +// --- TEST HELPERS ------------------------------------------------------------ + +fn test_lists_roundtrip( + input: String, + separator: String, + line_ending: LineEnding, +) -> Nil { + let assert Ok(parsed) = gsv.to_lists(input) + let encoded = gsv.from_lists(parsed, separator, line_ending) 
+ encoded |> should.equal(input) +} From 2d7cebd89457937f95c2bd7c116b0a9f82df1ad1 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 16:05:02 +0200 Subject: [PATCH 09/23] documentation for to_lists --- src/gsv.gleam | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/gsv.gleam b/src/gsv.gleam index b467ded..1990eef 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -5,11 +5,24 @@ import gleam/result import gleam/string import gsv/internal/parse -/// Parses a csv string to a list of lists of strings. -/// Automatically handles Windows and Unix line endings. -/// Returns a string error msg if the string is not valid csv. -/// Unquoted strings are trimmed, while quoted strings have leading and trailing -/// whitespace preserved. +/// Parses a csv string into a list of lists of strings. +/// ## Examples +/// +/// ```gleam +/// "hello, world +/// goodbye, mars +/// " +/// |> gsv.to_lists +/// // [["hello", " world"], ["goodbye", " mars"]] +/// ``` +/// +/// > This implementation tries to stick as closely as possible to +/// > [RFC4180](https://www.ietf.org/rfc/rfc4180.txt), with a couple notable +/// > convenience differences: +/// > - both `\n` and `\r\n` line endings are accepted. +/// > - a line can start with an empty field `,two,three`. +/// > - empty lines are allowed and just ignored. 
+/// pub fn to_lists(input: String) -> Result(List(List(String)), String) { parse.parse(input) |> result.map_error(fn(error) { From 4f290960e6cf691ce0382184c6c428ebdc4987ad Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 16:11:50 +0200 Subject: [PATCH 10/23] remove old implementation --- src/gsv/internal/token.gleam | 103 ----------------------------------- 1 file changed, 103 deletions(-) delete mode 100644 src/gsv/internal/token.gleam diff --git a/src/gsv/internal/token.gleam b/src/gsv/internal/token.gleam deleted file mode 100644 index 98f2aef..0000000 --- a/src/gsv/internal/token.gleam +++ /dev/null @@ -1,103 +0,0 @@ -//// We are using the following grammar for CSV from rfc4180 -//// -//// file = [header CRLF] record *(CRLF record) [CRLF] -//// header = name *(COMMA name) -//// record = field *(COMMA field) -//// name = field -//// field = (escaped / non-escaped) -//// escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE -//// non-escaped = *TEXTDATA - -import gleam/list -import gleam/string - -pub type CsvToken { - Comma - LF - CR - Doublequote - Textdata(inner: String) -} - -pub type Location { - Location(line: Int, column: Int) -} - -pub fn to_lexeme(token: CsvToken) -> String { - case token { - Comma -> "," - LF -> "\n" - CR -> "\r" - Doublequote -> "\"" - Textdata(str) -> str - } -} - -fn len(token: CsvToken) -> Int { - case token { - Comma -> 1 - LF -> 1 - CR -> 1 - Doublequote -> 1 - Textdata(str) -> string.length(str) - } -} - -pub fn scan(input: String) -> List(CsvToken) { - input - |> string.to_utf_codepoints - |> list.fold([], fn(acc, x) { - case string.utf_codepoint_to_int(x) { - 0x2c -> [Comma, ..acc] - 0x22 -> [Doublequote, ..acc] - 0x0a -> [LF, ..acc] - 0x0D -> [CR, ..acc] - _ -> { - let cp = string.from_utf_codepoints([x]) - case acc { - [Textdata(str), ..rest] -> [Textdata(str <> cp), ..rest] - _ -> [Textdata(cp), ..acc] - } - } - } - }) - |> list.reverse -} - -pub fn with_location(input: List(CsvToken)) -> 
List(#(CsvToken, Location)) { - do_with_location(input, [], Location(1, 1)) - |> list.reverse -} - -fn do_with_location( - input: List(CsvToken), - acc: List(#(CsvToken, Location)), - curr_loc: Location, -) -> List(#(CsvToken, Location)) { - let Location(line, column) = curr_loc - case input { - // Base case, no more tokens - [] -> acc - - // A newline, increment line number - [LF, ..rest] -> { - do_with_location(rest, [#(LF, curr_loc), ..acc], Location(line + 1, 1)) - } - [CR, LF, ..rest] -> { - do_with_location( - rest, - [#(LF, Location(line, column + 1)), #(CR, curr_loc), ..acc], - Location(line + 1, 1), - ) - } - - // Any other token just increment the column - [token, ..rest] -> { - do_with_location( - rest, - [#(token, curr_loc), ..acc], - Location(line, column + len(token)), - ) - } - } -} From 26407da35a011755abac3e3883e77f2d57723205 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 16:18:54 +0200 Subject: [PATCH 11/23] one big file good --- src/gsv.gleam | 275 +++++++++++++++++++++++++++++++++-- src/gsv/internal/parse.gleam | 261 --------------------------------- 2 files changed, 266 insertions(+), 270 deletions(-) delete mode 100644 src/gsv/internal/parse.gleam diff --git a/src/gsv.gleam b/src/gsv.gleam index 1990eef..07b0eec 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -1,9 +1,31 @@ import gleam/dict.{type Dict} -import gleam/io import gleam/list import gleam/result import gleam/string -import gsv/internal/parse + +// --- TYPES ------------------------------------------------------------------- + +pub type ParseError { + /// A field can contain a double quote only if it is escaped (that is, + /// surrounded by double quotes). For example `wobb"le` would be an invalid + /// field, the correct way to write such a field would be like this: + /// `"wobb""le"`. + /// + UnescapedQuote( + /// The byte index of the unescaped double. 
+ position: Int, + ) + + /// This error can occur when the file ends without the closing `"` of an + /// escaped field. For example: `"hello`. + /// + UnclosedEscapedField( + /// The byte index of the start of the unclosed escaped field. + start: Int, + ) +} + +// --- PARSING ----------------------------------------------------------------- /// Parses a csv string into a list of lists of strings. /// ## Examples @@ -23,12 +45,231 @@ import gsv/internal/parse /// > - a line can start with an empty field `,two,three`. /// > - empty lines are allowed and just ignored. /// -pub fn to_lists(input: String) -> Result(List(List(String)), String) { - parse.parse(input) - |> result.map_error(fn(error) { - io.debug(error) - todo as "decide what to do with errors" - }) +pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) { + case input { + // We just ignore all unescaped newlines at the beginning of a file. + "\n" <> rest | "\r\n" <> rest -> to_lists(rest) + // If it starts with a `"` then we know it starts with an escaped field. + "\"" <> rest -> do_parse(rest, input, 1, 0, [], [], ParsingEscapedField) + // If it starts with a `,` then it starts with an empty field we're filling + // out manually. + "," <> rest -> do_parse(rest, input, 1, 0, [""], [], CommaFound) + // Otherwise we just start parsing the first unescaped field. + _ -> do_parse(input, input, 0, 0, [], [], ParsingUnescapedField) + } +} + +/// This is used to keep track of what the parser is doing. +/// +type ParseStatus { + ParsingEscapedField + ParsingUnescapedField + CommaFound + NewlineFound +} + +/// ## What does this scary looking function do? +/// +/// At a high level, it goes over the csv `string` byte-by-byte and parses rows +/// accumulating those into `rows` as it goes. +/// +/// +/// ## Why does it have all these parameters? What does each one do? 
+/// +/// In order to be extra efficient this function parses the csv file in a single +/// pass and uses string slicing to avoid copying data. +/// Each time we see a new field we keep track of the byte where it starts with +/// `field_start` and then count the bytes (that's the `field_length` variable) +/// until we fiend its end (either a newline, the end of the file, or a `,`). +/// +/// After reaching the end of a field we extract it from the original string +/// taking a slice that goes from `field_start` and has `field_length` bytes. +/// This is where the magic happens: slicing a string this way is a constant +/// time operation and doesn't copy the string so it's crazy fast! +/// +/// `row` is an accumulator with all the fields of the current row as +/// they are parsed. Once we run into a newline `current_row` is added to all +/// the other `rows`. +/// +/// We also keep track of _what_ we're parsing with the `status` to make +/// sure that we're correctly dealing with escaped fields and double quotes. +/// +fn do_parse( + string: String, + original: String, + field_start: Int, + field_length: Int, + row: List(String), + rows: List(List(String)), + status: ParseStatus, +) -> Result(List(List(String)), ParseError) { + case string, status { + // If we find a comma we're done with the current field and can take a slice + // going from `field_start` with `field_length` bytes: + // + // wibble,wobble,... + // ╰────╯ field_length = 6 + // ┬ + // ╰ field_start + // + // After taking the slice we move the slice start _after_ the comma: + // + // wibble,wobble,... 
+ // ┬ + // ╰ field_start = field_start + field_length + 1 (the comma) + // + "," <> rest, CommaFound + | "," <> rest, NewlineFound + | "," <> rest, ParsingUnescapedField + -> { + let field = extract_field(original, field_start, field_length, status) + let row = [field, ..row] + let field_start = field_start + field_length + 1 + do_parse(rest, original, field_start, 0, row, rows, CommaFound) + } + "\"," <> rest, ParsingEscapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = [field, ..row] + let field_start = field_start + field_length + 2 + do_parse(rest, original, field_start, 0, row, rows, CommaFound) + } + + // When the string is over we're done parsing. + // We take the final field we were in the middle of parsing and add it to + // the current row that is returned together with all the parsed rows. + // + "", ParsingUnescapedField | "\"", ParsingEscapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = list.reverse([field, ..row]) + Ok(list.reverse([row, ..rows])) + } + + "", CommaFound -> { + let row = list.reverse(["", ..row]) + Ok(list.reverse([row, ..rows])) + } + + "", NewlineFound -> Ok(list.reverse(rows)) + + // If the string is over and we were parsing an escaped field, that's an + // error. We would expect to find a closing double quote before the end of + // the data. + // + "", ParsingEscapedField -> Error(UnclosedEscapedField(field_start)) + + // When we run into a new line (CRLF or just LF) we know we're done with the + // current field and take a slice of it, just like we did in the previous + // branch! + // The only difference is we also add the current `row` to all the other + // ones and start with a new one. + // + // > ⚠️ As for RFC 4180 lines should only be delimited by a CRLF. + // > Here we do something slightly different and also accept lines that are + // > delimited by just LF too. 
+ // + // The next three branches are the same except for the new `field_start` + // that has to take into account the different lengths. + // I tried writing it as `"\n" as sep | "\r\n" as sep | ...` and then taking + // adding the lenght of that but it had a noticeable (albeit small) impact + // on performance. + // + "\n" <> rest, ParsingUnescapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = list.reverse([field, ..row]) + let rows = [row, ..rows] + let field_start = field_start + field_length + 1 + do_parse(rest, original, field_start, 0, [], rows, NewlineFound) + } + "\r\n" <> rest, ParsingUnescapedField | "\"\n" <> rest, ParsingEscapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = list.reverse([field, ..row]) + let rows = [row, ..rows] + let field_start = field_start + field_length + 2 + do_parse(rest, original, field_start, 0, [], rows, NewlineFound) + } + "\"\r\n" <> rest, ParsingEscapedField -> { + let field = extract_field(original, field_start, field_length, status) + let row = list.reverse([field, ..row]) + let rows = [row, ..rows] + let field_start = field_start + field_length + 3 + do_parse(rest, original, field_start, 0, [], rows, NewlineFound) + } + + // If the newlines is immediately after a comma then the row ends with an + // empty field. + // + "\n" <> rest, CommaFound -> { + let row = list.reverse(["", ..row]) + let rows = [row, ..rows] + do_parse(rest, original, field_start + 1, 0, [], rows, NewlineFound) + } + "\r\n" <> rest, CommaFound -> { + let row = list.reverse(["", ..row]) + let rows = [row, ..rows] + do_parse(rest, original, field_start + 2, 0, [], rows, NewlineFound) + } + + // If the newline immediately comes after a newline that means we've run + // into an empty line that we can just safely ignore. 
+ // + "\n" <> rest, NewlineFound -> + do_parse(rest, original, field_start + 1, 0, row, rows, status) + "\r\n" <> rest, NewlineFound -> + do_parse(rest, original, field_start + 2, 0, row, rows, status) + + // An escaped quote found while parsing an escaped field. + // + "\"\"" <> rest, ParsingEscapedField -> + do_parse(rest, original, field_start, field_length + 2, row, rows, status) + + // An unescaped quote found while parsing a field. + // + "\"" <> _, ParsingUnescapedField | "\"" <> _, ParsingEscapedField -> + Error(UnescapedQuote(position: field_start + field_length)) + + // If the quote is found immediately after a comma or a newline that signals + // the start of a new escaped field to parse. + // + "\"" <> rest, CommaFound | "\"" <> rest, NewlineFound -> { + let status = ParsingEscapedField + do_parse(rest, original, field_start + 1, 0, row, rows, status) + } + + // In all other cases we're still parsing a field so we just drop a byte + // from the string we're iterating through, increase the size of the slice + // we need to take and keep going. + // + // > ⚠️ Notice how we're not trying to trim any whitespaces at the + // > beginning or end of a field: RFC 4810 states that "Spaces are + // > considered part of a field and should not be ignored." 
+ // + _, CommaFound + | _, NewlineFound + | _, ParsingUnescapedField + | _, ParsingEscapedField + -> { + let status = case status { + ParsingEscapedField -> ParsingEscapedField + CommaFound | NewlineFound | ParsingUnescapedField -> + ParsingUnescapedField + } + let rest = drop_bytes(string, 1) + do_parse(rest, original, field_start, field_length + 1, row, rows, status) + } + } +} + +fn extract_field( + string: String, + from: Int, + length: Int, + status: ParseStatus, +) -> String { + let field = slice_bytes(string, from, length) + case status { + CommaFound | ParsingUnescapedField | NewlineFound -> field + ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"") + } } /// Parses a csv string to a list of dicts. @@ -39,7 +280,7 @@ pub fn to_lists(input: String) -> Result(List(List(String)), String) { /// Whitespace only or empty strings are not valid headers and will be ignored. /// Whitespace only or empty strings are not considered "present" in the csv row and /// are not inserted into the row dict. -pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), String) { +pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError) { use lol <- result.try(to_lists(input)) case lol { [] -> [] @@ -149,3 +390,19 @@ pub fn from_dicts( } } } + +// --- FFI HELPERS ------------------------------------------------------------- + +/// In general this wouldn't be safe, by just slicing random bytes in the middle +/// of a utf8 string we might end up with something that is not a valid utf8 +/// string. +/// However, the parser only slices fields in between commas so it should always +/// yield valid utf8 slices. 
+/// +@external(erlang, "gsv_ffi", "slice") +@external(javascript, "../gsv_ffi.mjs", "slice") +fn slice_bytes(string: String, from: Int, length: Int) -> String + +@external(erlang, "gsv_ffi", "drop_bytes") +@external(javascript, "../gsv_ffi.mjs", "drop_bytes") +fn drop_bytes(string: String, bytes: Int) -> String diff --git a/src/gsv/internal/parse.gleam b/src/gsv/internal/parse.gleam deleted file mode 100644 index 2c54d3d..0000000 --- a/src/gsv/internal/parse.gleam +++ /dev/null @@ -1,261 +0,0 @@ -import gleam/list -import gleam/string - -pub fn parse(string) -> Result(List(List(String)), ParseError) { - case string { - // We just ignore all unescaped newlines at the beginning of a file. - "\n" <> rest | "\r\n" <> rest -> parse(rest) - // If it starts with a `"` then we know it starts with an escaped field. - "\"" <> rest -> do_parse(rest, string, 1, 0, [], [], ParsingEscapedField) - // If it starts with a `,` then it starts with an empty field we're filling - // out manually. - "," <> rest -> do_parse(rest, string, 1, 0, [""], [], CommaFound) - // Otherwise we just start parsing the first unescaped field. - _ -> do_parse(string, string, 0, 0, [], [], ParsingUnescapedField) - } -} - -pub type ParseError { - /// A field can contain a double quote only if it is escaped (that is, - /// surrounded by double quotes). For example `wobb"le` would be an invalid - /// field, the correct way to write such a field would be like this: - /// `"wobb""le"`. - /// - UnescapedQuote( - /// The byte index of the unescaped double. - position: Int, - ) - - /// This error can occur when the file ends without the closing `"` of an - /// escaped field. For example: `"hello`. - /// - UnclosedEscapedField( - /// The byte index of the start of the unclosed escaped field. - start: Int, - ) -} - -type ParseStatus { - ParsingEscapedField - ParsingUnescapedField - CommaFound - NewlineFound -} - -/// ## What does this scary looking function do? 
-/// -/// At a high level, it goes over the csv `string` byte-by-byte and parses rows -/// accumulating those into `rows` as it goes. -/// -/// -/// ## Why does it have all these parameters? What does each one do? -/// -/// In order to be extra efficient this function parses the csv file in a single -/// pass and uses string slicing to avoid copying data. -/// Each time we see a new field we keep track of the byte where it starts with -/// `field_start` and then count the bytes (that's the `field_length` variable) -/// until we fiend its end (either a newline, the end of the file, or a `,`). -/// -/// After reaching the end of a field we extract it from the original string -/// taking a slice that goes from `field_start` and has `field_length` bytes. -/// This is where the magic happens: slicing a string this way is a constant -/// time operation and doesn't copy the string so it's crazy fast! -/// -/// `row` is an accumulator with all the fields of the current row as -/// they are parsed. Once we run into a newline `current_row` is added to all -/// the other `rows`. -/// -/// We also keep track of _what_ we're parsing with the `status` to make -/// sure that we're correctly dealing with escaped fields and double quotes. -/// -fn do_parse( - string: String, - original: String, - field_start: Int, - field_length: Int, - row: List(String), - rows: List(List(String)), - status: ParseStatus, -) -> Result(List(List(String)), ParseError) { - case string, status { - // If we find a comma we're done with the current field and can take a slice - // going from `field_start` with `field_length` bytes: - // - // wibble,wobble,... - // ╰────╯ field_length = 6 - // ┬ - // ╰ field_start - // - // After taking the slice we move the slice start _after_ the comma: - // - // wibble,wobble,... 
- // ┬ - // ╰ field_start = field_start + field_length + 1 (the comma) - // - "," <> rest, CommaFound - | "," <> rest, NewlineFound - | "," <> rest, ParsingUnescapedField - -> { - let field = extract_field(original, field_start, field_length, status) - let row = [field, ..row] - let field_start = field_start + field_length + 1 - do_parse(rest, original, field_start, 0, row, rows, CommaFound) - } - "\"," <> rest, ParsingEscapedField -> { - let field = extract_field(original, field_start, field_length, status) - let row = [field, ..row] - let field_start = field_start + field_length + 2 - do_parse(rest, original, field_start, 0, row, rows, CommaFound) - } - - // When the string is over we're done parsing. - // We take the final field we were in the middle of parsing and add it to - // the current row that is returned together with all the parsed rows. - // - "", ParsingUnescapedField | "\"", ParsingEscapedField -> { - let field = extract_field(original, field_start, field_length, status) - let row = list.reverse([field, ..row]) - Ok(list.reverse([row, ..rows])) - } - - "", CommaFound -> { - let row = list.reverse(["", ..row]) - Ok(list.reverse([row, ..rows])) - } - - "", NewlineFound -> Ok(list.reverse(rows)) - - // If the string is over and we were parsing an escaped field, that's an - // error. We would expect to find a closing double quote before the end of - // the data. - // - "", ParsingEscapedField -> Error(UnclosedEscapedField(field_start)) - - // When we run into a new line (CRLF or just LF) we know we're done with the - // current field and take a slice of it, just like we did in the previous - // branch! - // The only difference is we also add the current `row` to all the other - // ones and start with a new one. - // - // > ⚠️ As for RFC 4180 lines should only be delimited by a CRLF. - // > Here we do something slightly different and also accept lines that are - // > delimited by just LF too. 
- // - // The next three branches are the same except for the new `field_start` - // that has to take into account the different lengths. - // I tried writing it as `"\n" as sep | "\r\n" as sep | ...` and then taking - // adding the lenght of that but it had a noticeable (albeit small) impact - // on performance. - // - "\n" <> rest, ParsingUnescapedField -> { - let field = extract_field(original, field_start, field_length, status) - let row = list.reverse([field, ..row]) - let rows = [row, ..rows] - let field_start = field_start + field_length + 1 - do_parse(rest, original, field_start, 0, [], rows, NewlineFound) - } - "\r\n" <> rest, ParsingUnescapedField | "\"\n" <> rest, ParsingEscapedField -> { - let field = extract_field(original, field_start, field_length, status) - let row = list.reverse([field, ..row]) - let rows = [row, ..rows] - let field_start = field_start + field_length + 2 - do_parse(rest, original, field_start, 0, [], rows, NewlineFound) - } - "\"\r\n" <> rest, ParsingEscapedField -> { - let field = extract_field(original, field_start, field_length, status) - let row = list.reverse([field, ..row]) - let rows = [row, ..rows] - let field_start = field_start + field_length + 3 - do_parse(rest, original, field_start, 0, [], rows, NewlineFound) - } - - // If the newlines is immediately after a comma then the row ends with an - // empty field. - // - "\n" <> rest, CommaFound -> { - let row = list.reverse(["", ..row]) - let rows = [row, ..rows] - do_parse(rest, original, field_start + 1, 0, [], rows, NewlineFound) - } - "\r\n" <> rest, CommaFound -> { - let row = list.reverse(["", ..row]) - let rows = [row, ..rows] - do_parse(rest, original, field_start + 2, 0, [], rows, NewlineFound) - } - - // If the newline immediately comes after a newline that means we've run - // into an empty line that we can just safely ignore. 
- // - "\n" <> rest, NewlineFound -> - do_parse(rest, original, field_start + 1, 0, row, rows, status) - "\r\n" <> rest, NewlineFound -> - do_parse(rest, original, field_start + 2, 0, row, rows, status) - - // An escaped quote found while parsing an escaped field. - // - "\"\"" <> rest, ParsingEscapedField -> - do_parse(rest, original, field_start, field_length + 2, row, rows, status) - - // An unescaped quote found while parsing a field. - // - "\"" <> _, ParsingUnescapedField | "\"" <> _, ParsingEscapedField -> - Error(UnescapedQuote(position: field_start + field_length)) - - // If the quote is found immediately after a comma or a newline that signals - // the start of a new escaped field to parse. - // - "\"" <> rest, CommaFound | "\"" <> rest, NewlineFound -> { - let status = ParsingEscapedField - do_parse(rest, original, field_start + 1, 0, row, rows, status) - } - - // In all other cases we're still parsing a field so we just drop a byte - // from the string we're iterating through, increase the size of the slice - // we need to take and keep going. - // - // > ⚠️ Notice how we're not trying to trim any whitespaces at the - // > beginning or end of a field: RFC 4810 states that "Spaces are - // > considered part of a field and should not be ignored." 
- // - _, CommaFound - | _, NewlineFound - | _, ParsingUnescapedField - | _, ParsingEscapedField - -> { - let status = case status { - ParsingEscapedField -> ParsingEscapedField - CommaFound | NewlineFound | ParsingUnescapedField -> - ParsingUnescapedField - } - let rest = drop_bytes(string, 1) - do_parse(rest, original, field_start, field_length + 1, row, rows, status) - } - } -} - -fn extract_field( - string: String, - from: Int, - length: Int, - status: ParseStatus, -) -> String { - let field = slice_bytes(string, from, length) - case status { - CommaFound | ParsingUnescapedField | NewlineFound -> field - ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"") - } -} - -/// In general this wouldn't be safe, by just slicing random bytes in the middle -/// of a utf8 string we might end up with something that is not a valid utf8 -/// string. -/// However, the parser only slices fields in between commas so it should always -/// yield valid utf8 slices. -/// -@external(erlang, "gsv_ffi", "slice") -@external(javascript, "../../gsv_ffi.mjs", "slice") -fn slice_bytes(string: String, from: Int, length: Int) -> String - -@external(erlang, "gsv_ffi", "drop_bytes") -@external(javascript, "../../gsv_ffi.mjs", "drop_bytes") -fn drop_bytes(string: String, bytes: Int) -> String From eab6f8c779d73c7dedbb5f9aa416502d97d6e311 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sat, 19 Oct 2024 16:53:38 +0200 Subject: [PATCH 12/23] nice error tests --- .../double_quote_in_middle_of_field.accepted | 9 ++ .../unclosed_escaped_field.accepted | 9 ++ ...ped_double_quote_in_escaped_field.accepted | 9 ++ gleam.toml | 1 + manifest.toml | 16 ++++ test/gsv_test.gleam | 84 +++++++++++++++++-- 6 files changed, 122 insertions(+), 6 deletions(-) create mode 100644 birdie_snapshots/double_quote_in_middle_of_field.accepted create mode 100644 birdie_snapshots/unclosed_escaped_field.accepted create mode 100644 birdie_snapshots/unescaped_double_quote_in_escaped_field.accepted 
diff --git a/birdie_snapshots/double_quote_in_middle_of_field.accepted b/birdie_snapshots/double_quote_in_middle_of_field.accepted new file mode 100644 index 0000000..05bd5bb --- /dev/null +++ b/birdie_snapshots/double_quote_in_middle_of_field.accepted @@ -0,0 +1,9 @@ +--- +version: 1.2.3 +title: double quote in middle of field +file: ./test/gsv_test.gleam +test_name: double_quote_in_middle_of_field_test +--- +field,other"field + ┬ + ╰─ This is an unescaped double quote \ No newline at end of file diff --git a/birdie_snapshots/unclosed_escaped_field.accepted b/birdie_snapshots/unclosed_escaped_field.accepted new file mode 100644 index 0000000..9407731 --- /dev/null +++ b/birdie_snapshots/unclosed_escaped_field.accepted @@ -0,0 +1,9 @@ +--- +version: 1.2.3 +title: unclosed escaped field +file: ./test/gsv_test.gleam +test_name: unclosed_escaped_field_test +--- +"closed","unclosed + ┬ + ╰─ This escaped field is not closed \ No newline at end of file diff --git a/birdie_snapshots/unescaped_double_quote_in_escaped_field.accepted b/birdie_snapshots/unescaped_double_quote_in_escaped_field.accepted new file mode 100644 index 0000000..d8a67da --- /dev/null +++ b/birdie_snapshots/unescaped_double_quote_in_escaped_field.accepted @@ -0,0 +1,9 @@ +--- +version: 1.2.3 +title: unescaped double quote in escaped field +file: ./test/gsv_test.gleam +test_name: unescaped_double_quote_in_escaped_field_test +--- +"unescaped double quote -> " in escaped field" + ┬ + ╰─ This is an unescaped double quote \ No newline at end of file diff --git a/gleam.toml b/gleam.toml index 5080dd0..a9fa90a 100644 --- a/gleam.toml +++ b/gleam.toml @@ -16,3 +16,4 @@ gleam_stdlib = ">= 0.40.0 and < 1.0.0" [dev-dependencies] gleeunit = "~> 1.0" +birdie = ">= 1.2.3 and < 2.0.0" diff --git a/manifest.toml b/manifest.toml index 5cfd9ac..9231034 100644 --- a/manifest.toml +++ b/manifest.toml @@ -2,10 +2,26 @@ # You typically do not need to edit this file packages = [ + { name = "argv", version = "1.0.2", 
build_tools = ["gleam"], requirements = [], otp_app = "argv", source = "hex", outer_checksum = "BA1FF0929525DEBA1CE67256E5ADF77A7CDDFE729E3E3F57A5BDCAA031DED09D" }, + { name = "birdie", version = "1.2.3", build_tools = ["gleam"], requirements = ["argv", "edit_distance", "filepath", "glance", "gleam_community_ansi", "gleam_erlang", "gleam_stdlib", "justin", "rank", "simplifile", "trie_again"], otp_app = "birdie", source = "hex", outer_checksum = "AE1207210E9CC8F4170BCE3FB3C23932F314C352C3FD1BCEA44CF4BF8CF60F93" }, + { name = "edit_distance", version = "2.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "edit_distance", source = "hex", outer_checksum = "A1E485C69A70210223E46E63985FA1008B8B2DDA9848B7897469171B29020C05" }, + { name = "filepath", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", source = "hex", outer_checksum = "EFB6FF65C98B2A16378ABC3EE2B14124168C0CE5201553DE652E2644DCFDB594" }, + { name = "glance", version = "0.11.0", build_tools = ["gleam"], requirements = ["gleam_stdlib", "glexer"], otp_app = "glance", source = "hex", outer_checksum = "8F3314D27773B7C3B9FB58D8C02C634290422CE531988C0394FA0DF8676B964D" }, + { name = "gleam_community_ansi", version = "1.4.1", build_tools = ["gleam"], requirements = ["gleam_community_colour", "gleam_stdlib"], otp_app = "gleam_community_ansi", source = "hex", outer_checksum = "4CD513FC62523053E62ED7BAC2F36136EC17D6A8942728250A9A00A15E340E4B" }, + { name = "gleam_community_colour", version = "1.4.0", build_tools = ["gleam"], requirements = ["gleam_json", "gleam_stdlib"], otp_app = "gleam_community_colour", source = "hex", outer_checksum = "795964217EBEDB3DA656F5EB8F67D7AD22872EB95182042D3E7AFEF32D3FD2FE" }, + { name = "gleam_erlang", version = "0.27.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_erlang", source = "hex", outer_checksum = "DE468F676D71B313C6C8C5334425CFCF827837333F8AB47B64D8A6D7AA40185D" }, + { name = 
"gleam_json", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib", "thoas"], otp_app = "gleam_json", source = "hex", outer_checksum = "9063D14D25406326C0255BDA0021541E797D8A7A12573D849462CAFED459F6EB" }, { name = "gleam_stdlib", version = "0.40.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "86606B75A600BBD05E539EB59FABC6E307EEEA7B1E5865AFB6D980A93BCB2181" }, { name = "gleeunit", version = "1.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "F7A7228925D3EE7D0813C922E062BFD6D7E9310F0BEE585D3A42F3307E3CFD13" }, + { name = "glexer", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glexer", source = "hex", outer_checksum = "BD477AD657C2B637FEF75F2405FAEFFA533F277A74EF1A5E17B55B1178C228FB" }, + { name = "justin", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "justin", source = "hex", outer_checksum = "7FA0C6DB78640C6DC5FBFD59BF3456009F3F8B485BF6825E97E1EB44E9A1E2CD" }, + { name = "rank", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "rank", source = "hex", outer_checksum = "5660E361F0E49CBB714CC57CC4C89C63415D8986F05B2DA0C719D5642FAD91C9" }, + { name = "simplifile", version = "2.2.0", build_tools = ["gleam"], requirements = ["filepath", "gleam_stdlib"], otp_app = "simplifile", source = "hex", outer_checksum = "0DFABEF7DC7A9E2FF4BB27B108034E60C81BEBFCB7AB816B9E7E18ED4503ACD8" }, + { name = "thoas", version = "1.2.1", build_tools = ["rebar3"], requirements = [], otp_app = "thoas", source = "hex", outer_checksum = "E38697EDFFD6E91BD12CEA41B155115282630075C2A727E7A6B2947F5408B86A" }, + { name = "trie_again", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "trie_again", source = "hex", outer_checksum = "5B19176F52B1BD98831B57FDC97BD1F88C8A403D6D8C63471407E78598E27184" 
}, ] [requirements] +birdie = { version = ">= 1.2.3 and < 2.0.0" } gleam_stdlib = { version = ">= 0.40.0 and < 1.0.0" } gleeunit = { version = "~> 1.0" } diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam index ab40085..2a9e9e0 100644 --- a/test/gsv_test.gleam +++ b/test/gsv_test.gleam @@ -1,4 +1,6 @@ +import birdie import gleam/dict +import gleam/list import gleam/string import gleeunit import gleeunit/should @@ -189,17 +191,22 @@ pub fn quotes_test() { pub fn double_quote_in_middle_of_field_test() { "field,other\"field" - |> gsv.to_lists - |> should.be_error - |> should.equal(todo) + |> pretty_print_error + |> birdie.snap("double quote in middle of field") } pub fn unescaped_double_quote_in_escaped_field_test() { "'unescaped double quote -> ' in escaped field'" |> string.replace(each: "'", with: "\"") - |> gsv.to_lists - |> should.be_error - |> should.equal(todo) + |> pretty_print_error + |> birdie.snap("unescaped double quote in escaped field") +} + +pub fn unclosed_escaped_field_test() { + "'closed','unclosed" + |> string.replace(each: "'", with: "\"") + |> pretty_print_error + |> birdie.snap("unclosed escaped field") } pub fn unescaped_carriage_return_test() { @@ -254,3 +261,68 @@ fn test_lists_roundtrip( let encoded = gsv.from_lists(parsed, separator, line_ending) encoded |> should.equal(input) } + +fn pretty_print_error(input: String) -> String { + let assert Error(error) = gsv.to_lists(input) + let error_message = error_to_message(error) + let #(error_line, error_column) = + error_to_position(error) + |> position_to_line_and_column(in: input) + + string.replace(in: input, each: "\r\n", with: "\n") + |> string.split(on: "\n") + |> list.index_map(fn(line, line_number) { + case line_number == error_line { + False -> line + True -> { + let padding = string.repeat(" ", error_column) + let pointer_line = padding <> "┬" + let message_line = padding <> "╰─ " <> error_message + line <> "\n" <> pointer_line <> "\n" <> message_line + } + } + }) + |> 
string.join(with: "\n") +} + +fn error_to_position(error: gsv.ParseError) -> Int { + case error { + gsv.UnclosedEscapedField(position) | gsv.UnescapedQuote(position) -> + position + } +} + +fn error_to_message(error: gsv.ParseError) -> String { + case error { + gsv.UnclosedEscapedField(_) -> "This escaped field is not closed" + gsv.UnescapedQuote(_) -> "This is an unescaped double quote" + } +} + +fn position_to_line_and_column(position: Int, in string: String) -> #(Int, Int) { + do_position_to_line_and_column(string, position, 0, 0) +} + +fn do_position_to_line_and_column( + string: String, + position: Int, + line: Int, + col: Int, +) -> #(Int, Int) { + case position, string { + 0, _ -> #(line, col) + _, "" -> panic as "position out of string bounds" + _, "\n" <> rest -> + do_position_to_line_and_column(rest, position - 1, line + 1, 0) + _, "\r\n" <> rest -> + do_position_to_line_and_column(rest, position - 2, line + 1, 0) + _, _ -> { + let rest = drop_bytes(string, 1) + do_position_to_line_and_column(rest, position - 1, line, col + 1) + } + } +} + +@external(erlang, "gsv_ffi", "drop_bytes") +@external(javascript, "../src/gsv_ffi.mjs", "drop_bytes") +fn drop_bytes(string: String, bytes: Int) -> String From 8b395890d46ae0d76b116029587992493915ccf2 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 10:21:54 +0100 Subject: [PATCH 13/23] fix ffi bug --- src/gsv_ffi.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsv_ffi.erl b/src/gsv_ffi.erl index acec49e..10f3c64 100644 --- a/src/gsv_ffi.erl +++ b/src/gsv_ffi.erl @@ -7,5 +7,5 @@ slice(String, Index, Length) -> drop_bytes(String, Bytes) -> case String of <<_:Bytes/bytes, Rest/binary>> -> Rest; - <<>> -> <<>> + _ -> String end. 
From b70c0316accf876aa702e2a21d7763db9c45c2cd Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 11:30:33 +0100 Subject: [PATCH 14/23] to_dicts --- gleam.toml | 1 + manifest.toml | 2 + src/gsv.gleam | 152 +++++++++++++++++++++++++++++--------------- test/gsv_test.gleam | 50 +++++++++++---- 4 files changed, 140 insertions(+), 65 deletions(-) diff --git a/gleam.toml b/gleam.toml index a9fa90a..1e538df 100644 --- a/gleam.toml +++ b/gleam.toml @@ -13,6 +13,7 @@ internal_modules = [ [dependencies] gleam_stdlib = ">= 0.40.0 and < 1.0.0" +glearray = ">= 1.0.0 and < 2.0.0" [dev-dependencies] gleeunit = "~> 1.0" diff --git a/manifest.toml b/manifest.toml index 9231034..bde2f68 100644 --- a/manifest.toml +++ b/manifest.toml @@ -12,6 +12,7 @@ packages = [ { name = "gleam_erlang", version = "0.27.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_erlang", source = "hex", outer_checksum = "DE468F676D71B313C6C8C5334425CFCF827837333F8AB47B64D8A6D7AA40185D" }, { name = "gleam_json", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib", "thoas"], otp_app = "gleam_json", source = "hex", outer_checksum = "9063D14D25406326C0255BDA0021541E797D8A7A12573D849462CAFED459F6EB" }, { name = "gleam_stdlib", version = "0.40.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "86606B75A600BBD05E539EB59FABC6E307EEEA7B1E5865AFB6D980A93BCB2181" }, + { name = "glearray", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glearray", source = "hex", outer_checksum = "B99767A9BC63EF9CC8809F66C7276042E5EFEACAA5B25188B552D3691B91AC6D" }, { name = "gleeunit", version = "1.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "F7A7228925D3EE7D0813C922E062BFD6D7E9310F0BEE585D3A42F3307E3CFD13" }, { name = "glexer", version = "1.0.1", build_tools = ["gleam"], requirements = 
["gleam_stdlib"], otp_app = "glexer", source = "hex", outer_checksum = "BD477AD657C2B637FEF75F2405FAEFFA533F277A74EF1A5E17B55B1178C228FB" }, { name = "justin", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "justin", source = "hex", outer_checksum = "7FA0C6DB78640C6DC5FBFD59BF3456009F3F8B485BF6825E97E1EB44E9A1E2CD" }, @@ -24,4 +25,5 @@ packages = [ [requirements] birdie = { version = ">= 1.2.3 and < 2.0.0" } gleam_stdlib = { version = ">= 0.40.0 and < 1.0.0" } +glearray = { version = ">= 1.0.0 and < 2.0.0" } gleeunit = { version = "~> 1.0" } diff --git a/src/gsv.gleam b/src/gsv.gleam index 07b0eec..f2087ab 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -2,17 +2,22 @@ import gleam/dict.{type Dict} import gleam/list import gleam/result import gleam/string +import glearray // --- TYPES ------------------------------------------------------------------- pub type ParseError { + /// This error can occur if a csv field contains an unescaped double + /// quote `"`. + /// /// A field can contain a double quote only if it is escaped (that is, - /// surrounded by double quotes). For example `wobb"le` would be an invalid + /// surrounded by double quotes). For example `wibb"le` would be an invalid /// field, the correct way to write such a field would be like this: - /// `"wobb""le"`. + /// `"wibb""le"`. /// UnescapedQuote( /// The byte index of the unescaped double. + /// position: Int, ) @@ -21,21 +26,45 @@ pub type ParseError { /// UnclosedEscapedField( /// The byte index of the start of the unclosed escaped field. + /// start: Int, ) } +/// Possible line endings used when turning a parsed csv back into a string +/// with the `from_lists` and `from_dicts` functions. +/// +pub type LineEnding { + /// The CRLF line ending: `\r\n`. + /// + Windows + + /// The LF line ending: `\n`. 
+ Unix +} + +fn le_to_string(le: LineEnding) -> String { + case le { + Windows -> "\r\n" + Unix -> "\n" + } +} + // --- PARSING ----------------------------------------------------------------- -/// Parses a csv string into a list of lists of strings. +/// Parses a csv string into a list of lists of strings: each line of the csv +/// will be turned into a list with an item for each field. +/// /// ## Examples /// /// ```gleam /// "hello, world -/// goodbye, mars -/// " +/// goodbye, mars" /// |> gsv.to_lists -/// // [["hello", " world"], ["goodbye", " mars"]] +/// // Ok([ +/// // ["hello", " world"], +/// // ["goodbye", " mars"], +/// // ]) /// ``` /// /// > This implementation tries to stick as closely as possible to @@ -62,9 +91,22 @@ pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) { /// This is used to keep track of what the parser is doing. /// type ParseStatus { + /// We're in the middle of parsing an escaped csv field (that is, starting + /// and ending with `"`). + /// ParsingEscapedField + + /// We're in the middle of parsing a regular csv field. + /// ParsingUnescapedField + + /// We've just run into a (non escaped) comma, signalling the end of a field. + /// CommaFound + + /// We've just run into a (non escaped) newline (either a `\n` or `\r\n`), + /// signalling the end of a line and the start of a new one. + /// NewlineFound } @@ -268,60 +310,64 @@ fn extract_field( let field = slice_bytes(string, from, length) case status { CommaFound | ParsingUnescapedField | NewlineFound -> field + // If we were parsing an escaped field then escaped quotes must be replaced + // with a single one. ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"") } } -/// Parses a csv string to a list of dicts. -/// Automatically handles Windows and Unix line endings. -/// Returns a string error msg if the string is not valid csv. 
-/// Unquoted strings are trimmed, while quoted strings have leading and trailing -/// whitespace preserved. -/// Whitespace only or empty strings are not valid headers and will be ignored. -/// Whitespace only or empty strings are not considered "present" in the csv row and -/// are not inserted into the row dict. +/// Parses a csv string into a list of dicts: the first line of the csv is +/// interpreted as the headers' row and each of the following lines is turned +/// into a dict with a value for each of the headers. +/// +/// If a field is empty then it won't be added to the dict. +/// +/// ## Examples +/// +/// ```gleam +/// "pet,name,cuteness +/// dog,Fido,100 +/// cat,,1000 +/// " +/// |> gsv.to_dicts +/// // Ok([ +/// // dict.from_list([ +/// // #("pet", "dog"), #("name", "Fido"), #("cuteness", "100") +/// // ]), +/// // dict.from_list([ +/// // #("pet", "cat"), #("cuteness", "1000") +/// // ]), +/// // ]) +/// ``` +/// +/// > Just like `to_lists`, this implementation tries to stick as closely as +/// > possible to [RFC4180](https://www.ietf.org/rfc/rfc4180.txt). +/// > You can look at `to_lists`' documentation to see how it differs from the +/// > RFC. 
+/// pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError) { - use lol <- result.try(to_lists(input)) - case lol { + use rows <- result.map(to_lists(input)) + case rows { [] -> [] [headers, ..rows] -> { - let headers = - list.index_fold(headers, dict.new(), fn(acc, x, i) { - case string.trim(x) == "" { - True -> acc - False -> dict.insert(acc, i, x) - } - }) + let headers = glearray.from_list(headers) - list.map(rows, fn(row) { - use acc, x, i <- list.index_fold(row, dict.new()) - case dict.get(headers, i) { - Error(Nil) -> acc - Ok(h) -> - case string.trim(x) { - "" -> acc - t -> dict.insert(acc, string.trim(h), t) - } - } - }) + use row <- list.map(rows) + use row, field, index <- list.index_fold(row, dict.new()) + case field { + // If the field is empty then we don't add it to the row's dict. + "" -> row + _ -> + // We look for the header corresponding to this field's position. + case glearray.get(headers, index) { + Ok(header) -> dict.insert(row, header, field) + // This could happen if the row has more fields than headers in the + // header row, in this case the field is just discarded + Error(_) -> row + } + } } } - |> Ok -} - -/// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows" -/// line endings. Use with the `from_lists` function when -/// writing to a csv string. -pub type LineEnding { - Windows - Unix -} - -fn le_to_string(le: LineEnding) -> String { - case le { - Windows -> "\r\n" - Unix -> "\n" - } } /// Takes a list of lists of strings and writes it to a csv string. 
@@ -329,6 +375,7 @@ fn le_to_string(le: LineEnding) -> String { /// line endings with double quotes (in csv, double quotes get escaped by doing /// a double doublequote) /// The string `he"llo\n` becomes `"he""llo\n"` +/// pub fn from_lists( input: List(List(String)), separator separator: String, @@ -360,6 +407,7 @@ pub fn from_lists( /// line endings with double quotes (in csv, double quotes get escaped by doing /// a double doublequote) /// The string `he"llo\n` becomes `"he""llo\n"` +/// pub fn from_dicts( input: List(Dict(String, String)), separator separator: String, @@ -400,9 +448,9 @@ pub fn from_dicts( /// yield valid utf8 slices. /// @external(erlang, "gsv_ffi", "slice") -@external(javascript, "../gsv_ffi.mjs", "slice") +@external(javascript, "./gsv_ffi.mjs", "slice") fn slice_bytes(string: String, from: Int, length: Int) -> String @external(erlang, "gsv_ffi", "drop_bytes") -@external(javascript, "../gsv_ffi.mjs", "drop_bytes") +@external(javascript, "./gsv_ffi.mjs", "drop_bytes") fn drop_bytes(string: String, bytes: Int) -> String diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam index 2a9e9e0..96a88b3 100644 --- a/test/gsv_test.gleam +++ b/test/gsv_test.gleam @@ -133,7 +133,9 @@ pub fn escaped_field_with_escaped_double_quotes_test() { // --- DICT PARSING ------------------------------------------------------------ pub fn headers_test() { - "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n" + "name,age +Ben,27,TRUE,Hello +Austin,27," |> gsv.to_dicts |> should.be_ok |> should.equal([ @@ -143,20 +145,30 @@ pub fn headers_test() { } pub fn dicts_with_empty_str_header_test() { - "name,\" \", ,,age\nBen,foo,bar,baz,27,extra_data" + "name,\" \", ,,age +Ben,wibble,wobble,woo,27,extra_data" |> gsv.to_dicts |> should.be_ok - |> gsv.from_dicts(",", Unix) - |> should.equal("age,name\n27,Ben") + |> should.equal([ + dict.from_list([ + #("name", "Ben"), + #(" ", "wibble"), + #(" ", "wobble"), + #("", "woo"), + #("age", "27"), + ]), + ]) } pub fn 
dicts_with_empty_values_test() { - "name, age\nBen,,,,\nAustin, 27" + "name,age +Ben,,,, +Austin,27" |> gsv.to_dicts |> should.be_ok |> should.equal([ dict.from_list([#("name", "Ben")]), - dict.from_list([#("age", "27"), #("name", "Austin")]), + dict.from_list([#("name", "Austin"), #("age", "27")]), ]) } @@ -233,21 +245,33 @@ Austin, 25, FALSE" } pub fn encode_with_escaped_string_windows_test() { - let assert Ok(lls) = - "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE" + let assert Ok(rows) = + "Ben, 25,' TRUE\n\r'' ' +Austin, 25, FALSE" + |> string.replace(each: "'", with: "\"") |> gsv.to_lists - lls + rows |> gsv.from_lists(separator: ",", line_ending: Windows) - |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\r\nAustin,25,FALSE") + |> string.replace(each: "\"", with: "'") + |> should.equal( + "Ben, 25,' TRUE\n\r'' '\r +Austin, 25, FALSE", + ) } pub fn dicts_round_trip_test() { - "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n" + "name,age +Ben,27,TRUE,Hello +Austin,27," |> gsv.to_dicts |> should.be_ok |> gsv.from_dicts(",", Unix) - |> should.equal("age,name\n27,Ben\n27,Austin") + |> should.equal( + "age,name +27,Ben +27,Austin", + ) } // --- TEST HELPERS ------------------------------------------------------------ @@ -324,5 +348,5 @@ fn do_position_to_line_and_column( } @external(erlang, "gsv_ffi", "drop_bytes") -@external(javascript, "../src/gsv_ffi.mjs", "drop_bytes") +@external(javascript, "./gsv_ffi.mjs", "drop_bytes") fn drop_bytes(string: String, bytes: Int) -> String From f7225099f7aca9f69789043e2e0bf15ac8acfc61 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 12:42:51 +0100 Subject: [PATCH 15/23] from_lists --- src/gsv.gleam | 129 +++++++++++++++++++++++++++++++++++--------- test/gsv_test.gleam | 7 --- 2 files changed, 104 insertions(+), 32 deletions(-) diff --git a/src/gsv.gleam b/src/gsv.gleam index f2087ab..199098d 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -43,7 +43,7 @@ pub type LineEnding { Unix } -fn le_to_string(le: 
LineEnding) -> String { +fn line_ending_to_string(le: LineEnding) -> String { case le { Windows -> "\r\n" Unix -> "\n" @@ -370,36 +370,115 @@ pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError) } } -/// Takes a list of lists of strings and writes it to a csv string. -/// Will automatically escape strings that contain double quotes or -/// line endings with double quotes (in csv, double quotes get escaped by doing -/// a double doublequote) -/// The string `he"llo\n` becomes `"he""llo\n"` +/// Takes a list of lists of strings and turns it to a csv string, automatically +/// escaping all fields that contain double quotes or line endings. +/// +/// ## Examples +/// +/// ```gleam +/// let rows = [["hello", "world"], ["goodbye", "mars"]] +/// from_lists(rows, separator: ",", line_ending: Unix) +/// // "hello,world +/// // goodbye,mars" +/// ``` +/// +/// ```gleam +/// let rows = [[]] +/// ``` /// pub fn from_lists( - input: List(List(String)), + rows: List(List(String)), separator separator: String, line_ending line_ending: LineEnding, ) -> String { - input - |> list.map(fn(row) { - list.map(row, fn(entry) { - // Double quotes need to be escaped with an extra doublequote - let entry = string.replace(entry, "\"", "\"\"") - - // If the string contains a , \n \r\n or " it needs to be escaped by wrapping in double quotes - case - string.contains(entry, separator) - || string.contains(entry, "\n") - || string.contains(entry, "\"") - { - True -> "\"" <> entry <> "\"" - False -> entry + let line_ending = line_ending_to_string(line_ending) + do_from_lists(rows, separator, line_ending, "") +} + +fn do_from_lists( + rows: List(List(String)), + separator: String, + line_ending: String, + acc: String, +) -> String { + case rows { + [] -> acc + // When we're down to the last row, we don't add a final newline at the end + // of the string. So we special handle this case and pass in an empty string + // as the `line_ending` to add to the row. 
+ [last_row] -> row_to_string(last_row, separator, "", acc) + // For all other cases we just accumulate the line string onto the string + // accumulator. + [row, ..rest] -> { + let acc = row_to_string(row, separator, line_ending, acc) + do_from_lists(rest, separator, line_ending, acc) + } + } +} + +fn row_to_string( + row: List(String), + separator: String, + line_ending: String, + acc: String, +) -> String { + case row { + [] -> acc + // When we're down to the last field of the row we need to add the line + // ending instead of the field separator. So we special handle this case. + [last_field] -> acc <> escape_field(last_field, separator) <> line_ending + // For all other cases we add the field to the accumulator and append a + // separator to separate it from the next field in the row. + [field, ..rest] -> { + let acc = acc <> escape_field(field, separator) <> separator + row_to_string(rest, separator, line_ending, acc) + } + } +} + +/// The kind of escaping needed by a csv field. +/// +type Escaping { + NoEscaping + WrapInDoubleQuotes + WrapInDoubleQuotesAndEscapeDoubleQuotes +} + +fn escape_field(field: String, separator: String) -> String { + case escaping(field, separator) { + NoEscaping -> field + WrapInDoubleQuotes -> "\"" <> field <> "\"" + WrapInDoubleQuotesAndEscapeDoubleQuotes -> + "\"" <> string.replace(in: field, each: "\"", with: "\"\"") <> "\"" + } +} + +fn escaping(string: String, separator: String) -> Escaping { + do_escaping(string, separator, NoEscaping) +} + +fn do_escaping(string: String, separator: String, kind: Escaping) { + case string { + // As soon as we find a double quote we know that we must escape the double + // quotes and wrap it in double quotes, no need to keep going through the + // string. + "\"" <> _ -> WrapInDoubleQuotesAndEscapeDoubleQuotes + // If we find a newline we know the string must at least be wrapped in + // double quotes but we keep going in case we find a `"`. 
+ "\n" <> rest -> do_escaping(rest, separator, WrapInDoubleQuotes) + // If we reach the end of the string we return the accumulator. + "" -> kind + // In all other cases we check if the string starts with the separator, in + // that case we know it must be at least wrapped in double quotes. + // But we keep going in case we find a `"`. + _ -> { + let assert Ok(#(_, rest)) = string.pop_grapheme(string) + case kind == WrapInDoubleQuotes || string.starts_with(string, separator) { + True -> do_escaping(rest, separator, WrapInDoubleQuotes) + False -> do_escaping(rest, separator, kind) } - }) - }) - |> list.map(fn(row) { string.join(row, separator) }) - |> string.join(le_to_string(line_ending)) + } + } } /// Takes a list of dicts and writes it to a csv string. diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam index 96a88b3..ea69529 100644 --- a/test/gsv_test.gleam +++ b/test/gsv_test.gleam @@ -221,13 +221,6 @@ pub fn unclosed_escaped_field_test() { |> birdie.snap("unclosed escaped field") } -pub fn unescaped_carriage_return_test() { - todo as "decide what to do" - "test\n\r" - |> gsv.to_lists - |> should.be_error -} - // --- ENCODING TESTS ---------------------------------------------------------- pub fn encode_test() { From 8fa68b0551cfa4e978cc52e9c4317b45154cf237 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 12:58:11 +0100 Subject: [PATCH 16/23] from_dicts --- src/gsv.gleam | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/gsv.gleam b/src/gsv.gleam index 199098d..d2611f7 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -488,36 +488,36 @@ fn do_escaping(string: String, separator: String, kind: Escaping) { /// The string `he"llo\n` becomes `"he""llo\n"` /// pub fn from_dicts( - input: List(Dict(String, String)), + rows: List(Dict(String, String)), separator separator: String, line_ending line_ending: LineEnding, ) -> String { - case input { + case rows { [] -> "" _ -> { let headers = - 
input - |> list.map(dict.keys) - |> list.flatten + rows + |> list.flat_map(dict.keys) |> list.unique |> list.sort(string.compare) - let rows = - list.map(input, fn(row) { - list.fold(headers, [], fn(acc, h) { - case dict.get(row, h) { - Ok(v) -> [v, ..acc] - Error(Nil) -> ["", ..acc] - } - }) - }) - |> list.map(list.reverse) - + let rows = list.map(rows, row_dict_to_list(_, headers)) from_lists([headers, ..rows], separator, line_ending) } } } +fn row_dict_to_list( + row: Dict(String, String), + headers: List(String), +) -> List(String) { + use header <- list.map(headers) + case dict.get(row, header) { + Ok(field) -> field + Error(Nil) -> "" + } +} + // --- FFI HELPERS ------------------------------------------------------------- /// In general this wouldn't be safe, by just slicing random bytes in the middle From 55ef9eeab9a50e7151e7cb48649171ab6df2515d Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 13:53:55 +0100 Subject: [PATCH 17/23] rebase gone wrong --- src/gsv/internal/ast.gleam | 196 ------------------------------------- 1 file changed, 196 deletions(-) delete mode 100644 src/gsv/internal/ast.gleam diff --git a/src/gsv/internal/ast.gleam b/src/gsv/internal/ast.gleam deleted file mode 100644 index 56f667f..0000000 --- a/src/gsv/internal/ast.gleam +++ /dev/null @@ -1,196 +0,0 @@ -//// We are using the following grammar for CSV from rfc4180 -//// -//// file = [header CRLF] record *(CRLF record) [CRLF] -//// header = name *(COMMA name) -//// record = field *(COMMA field) -//// name = field -//// field = (escaped / non-escaped) -//// escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE -//// non-escaped = *TEXTDATA - -import gleam/list -import gleam/result -import gleam/string -import gsv/internal/token.{ - type CsvToken, type Location, CR, Comma, Doublequote, LF, Location, Textdata, -} - -type ParseState { - Beginning - JustParsedField - JustParsedComma - JustParsedNewline - JustParsedCR - InsideEscapedString -} - -pub type 
ParseError { - ParseError(location: Location, message: String) -} - -pub fn parse( - input: List(#(CsvToken, Location)), -) -> Result(List(List(String)), ParseError) { - let inner_rev = { - use llf <- result.try(parse_p(input, Beginning, [])) - use lf <- list.try_map(llf) - Ok(list.reverse(lf)) - } - use ir <- result.try(inner_rev) - Ok(list.reverse(ir)) -} - -fn parse_p( - input: List(#(CsvToken, Location)), - parse_state: ParseState, - llf: List(List(String)), -) -> Result(List(List(String)), ParseError) { - case input, parse_state, llf { - // Error Case: An empty list should produce an Error - [], Beginning, _ -> Error(ParseError(Location(0, 0), "Empty input")) - - // BASE CASE: We are done parsing tokens - [], _, llf -> Ok(llf) - - // File should begin with either Escaped or Nonescaped string - [#(Textdata(str), _), ..remaining_tokens], Beginning, [] -> - parse_p(remaining_tokens, JustParsedField, [[string.trim(str)]]) - - [#(Doublequote, _), ..remaining_tokens], Beginning, [] -> - parse_p(remaining_tokens, InsideEscapedString, [[""]]) - - [#(tok, loc), ..], Beginning, _ -> - Error(ParseError( - loc, - "Unexpected start to csv content: " <> token.to_lexeme(tok), - )) - - // If we just parsed a field, we're expecting either a comma or a CRLF - [#(Comma, _), ..remaining_tokens], JustParsedField, llf -> - parse_p(remaining_tokens, JustParsedComma, llf) - - [#(LF, _), ..remaining_tokens], JustParsedField, llf -> - parse_p(remaining_tokens, JustParsedNewline, llf) - - [#(CR, _), ..remaining_tokens], JustParsedField, llf -> - parse_p(remaining_tokens, JustParsedCR, llf) - - [#(tok, loc), ..], JustParsedField, _ -> - Error(ParseError( - loc, - "Expected comma or newline after field, found: " <> token.to_lexeme(tok), - )) - - // If we just parsed a CR, we're expecting an LF - [#(LF, _), ..remaining_tokens], JustParsedCR, llf -> - parse_p(remaining_tokens, JustParsedNewline, llf) - - [#(tok, loc), ..], JustParsedCR, _ -> - Error(ParseError( - loc, - "Expected \"\\n\" 
after \"\\r\", found: " <> token.to_lexeme(tok), - )) - - // If we just parsed a comma, we're expecting an Escaped or Non-Escaped string, or another comma - // (indicating an empty string) - [#(Textdata(str), _), ..remaining_tokens], - JustParsedComma, - [curr_line, ..previously_parsed_lines] - -> - parse_p(remaining_tokens, JustParsedField, [ - [string.trim(str), ..curr_line], - ..previously_parsed_lines - ]) - - [#(Doublequote, _), ..remaining_tokens], - JustParsedComma, - [curr_line, ..previously_parsed_lines] - -> - parse_p(remaining_tokens, InsideEscapedString, [ - ["", ..curr_line], - ..previously_parsed_lines - ]) - - [#(Comma, _), ..remaining_tokens], - JustParsedComma, - [curr_line, ..previously_parsed_lines] - -> - parse_p(remaining_tokens, JustParsedComma, [ - ["", ..curr_line], - ..previously_parsed_lines - ]) - - [#(CR, _), ..remaining_tokens], - JustParsedComma, - [curr_line, ..previously_parsed_lines] - -> - parse_p(remaining_tokens, JustParsedCR, [ - ["", ..curr_line], - ..previously_parsed_lines - ]) - - [#(LF, _), ..remaining_tokens], - JustParsedComma, - [curr_line, ..previously_parsed_lines] - -> - parse_p(remaining_tokens, JustParsedNewline, [ - ["", ..curr_line], - ..previously_parsed_lines - ]) - - [#(tok, loc), ..], JustParsedComma, _ -> - Error(ParseError( - loc, - "Expected escaped or non-escaped string after comma, found: " - <> token.to_lexeme(tok), - )) - - // If we just parsed a new line, we're expecting an escaped or non-escaped string - [#(Textdata(str), _), ..remaining_tokens], JustParsedNewline, llf -> - parse_p(remaining_tokens, JustParsedField, [[string.trim(str)], ..llf]) - - [#(Doublequote, _), ..remaining_tokens], - JustParsedNewline, - [curr_line, ..previously_parsed_lines] - -> - parse_p(remaining_tokens, InsideEscapedString, [ - [""], - curr_line, - ..previously_parsed_lines - ]) - - [#(tok, loc), ..], JustParsedNewline, _ -> - Error(ParseError( - loc, - "Expected escaped or non-escaped string after newline, found: " - <> 
token.to_lexeme(tok), - )) - - // If we're inside an escaped string, we can take anything until we get a double quote, - // but a double double quote "" escapes the double quote and we keep parsing - [#(Doublequote, _), #(Doublequote, _), ..remaining_tokens], - InsideEscapedString, - [[str, ..rest_curr_line], ..previously_parsed_lines] - -> - parse_p(remaining_tokens, InsideEscapedString, [ - [str <> "\"", ..rest_curr_line], - ..previously_parsed_lines - ]) - - [#(Doublequote, _), ..remaining_tokens], InsideEscapedString, llf -> - parse_p(remaining_tokens, JustParsedField, llf) - - [#(other_token, _), ..remaining_tokens], - InsideEscapedString, - [[str, ..rest_curr_line], ..previously_parsed_lines] - -> - parse_p(remaining_tokens, InsideEscapedString, [ - [str <> token.to_lexeme(other_token), ..rest_curr_line], - ..previously_parsed_lines - ]) - - // Anything else is an error - [#(tok, loc), ..], _, _ -> - Error(ParseError(loc, "Unexpected token: " <> token.to_lexeme(tok))) - } -} From a1b15f46be2ce3238b4c4239c06db59b63b89ccb Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 13:58:44 +0100 Subject: [PATCH 18/23] changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb0a433..d024a31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ # Changelog ## Unreleased +- Improved performance of `to_lists`, `to_dicts`, `from_lists` and `from_dicts`. +- Parsing now doesn't trim the csv fields, conforming to RFC4180. +- The `to_lists` and `to_dicts` functions now return a structured error instead + of a `String`. ## v2.0.3 - 25 October 2024 - Patch to remove some unused imports. 
From 864fc8dfc62ce0bdeed5bb404abfb2e1aad8876a Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 14:22:34 +0100 Subject: [PATCH 19/23] readme --- README.md | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 1c8570b..7507a3f 100644 --- a/README.md +++ b/README.md @@ -3,42 +3,30 @@ [![Package Version](https://img.shields.io/hexpm/v/gsv)](https://hex.pm/packages/gsv) [![Hex Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/gsv/) -This is a simple csv parser and writer for Gleam. It will get more performant/battle tested in the future, -but if you're looking for that now, I'd recommend doing ffi to an existing parser in your target runtime. - -#### Example +A simple csv parser and serialiser for Gleam. ```gleam import gsv.{Unix, Windows} pub fn main() { - let csv_str = - "Hello,World -Goodbye,Mars" - - // Parse a CSV string to a List(List(String)) - let assert Ok(records) = gsv.to_lists(csv_str) + let csv = + "name,loves +lucy,gleam" - // Write a List(List(String)) to a CSV string - let csv_str = records - |> gsv.from_lists(separator: ",", line_ending: Windows) + // Parse a csv string into a list of rows. + let assert Ok(rows) = gsv.to_lists(csv) + // -> [["name", "loves"], ["lucy", "gleam"]] - // Parse a CSV string with headers to a List(Dict(String, String)) - let assert Ok(records) = gsv.to_dicts(csv_str) - // => [ dict.from_list([ #("Hello", "Goodbye"), #("World", "Mars") ]) ] - - // Write a List(Dict(String, String)) to a CSV string, treating the keys as the header row - let csv_str = records - |> gsv.from_dicts(separator: ",", line_ending: Windows) + // If your csv has headers you can also parse it into a list of dictionaries. 
+ let assert Ok(rows) = gsv.to_dicts(csv) + // -> [dict.from_list([#("name", "lucy"), #("loves", "gleam")])] } ``` ## Installation -If available on Hex this package can be added to your Gleam project: +To add this package to your Gleam project: ```sh -gleam add gsv +gleam add gsv@3 ``` - -and its documentation can be found at . From 7d607f449f79436afd48683fc8fc7116a02e5736 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 15:05:20 +0100 Subject: [PATCH 20/23] improve from_lists --- src/gsv.gleam | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/gsv.gleam b/src/gsv.gleam index d2611f7..dbecb82 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -392,15 +392,17 @@ pub fn from_lists( line_ending line_ending: LineEnding, ) -> String { let line_ending = line_ending_to_string(line_ending) - do_from_lists(rows, separator, line_ending, "") + do_from_lists(rows, separator, line_ending, []) + |> list.reverse + |> string.join(with: "") } fn do_from_lists( rows: List(List(String)), separator: String, line_ending: String, - acc: String, -) -> String { + acc: List(String), +) -> List(String) { case rows { [] -> acc // When we're down to the last row, we don't add a final newline at the end @@ -420,17 +422,19 @@ fn row_to_string( row: List(String), separator: String, line_ending: String, - acc: String, -) -> String { + acc: List(String), +) -> List(String) { case row { [] -> acc + // When we're down to the last field of the row we need to add the line // ending instead of the field separator. So we special handle this case. - [last_field] -> acc <> escape_field(last_field, separator) <> line_ending + [last_field] -> [line_ending, escape_field(last_field, separator), ..acc] + // For all other cases we add the field to the accumulator and append a // separator to separate it from the next field in the row. 
[field, ..rest] -> { - let acc = acc <> escape_field(field, separator) <> separator + let acc = [separator, escape_field(field, separator), ..acc] row_to_string(rest, separator, line_ending, acc) } } @@ -454,10 +458,13 @@ fn escape_field(field: String, separator: String) -> String { } fn escaping(string: String, separator: String) -> Escaping { - do_escaping(string, separator, NoEscaping) + case string.contains(string, separator) { + True -> do_escaping(string, WrapInDoubleQuotes) + False -> do_escaping(string, NoEscaping) + } } -fn do_escaping(string: String, separator: String, kind: Escaping) { +fn do_escaping(string: String, kind: Escaping) { case string { // As soon as we find a double quote we know that we must escape the double // quotes and wrap it in double quotes, no need to keep going through the @@ -465,19 +472,11 @@ fn do_escaping(string: String, separator: String, kind: Escaping) { "\"" <> _ -> WrapInDoubleQuotesAndEscapeDoubleQuotes // If we find a newline we know the string must at least be wrapped in // double quotes but we keep going in case we find a `"`. - "\n" <> rest -> do_escaping(rest, separator, WrapInDoubleQuotes) + "\n" <> rest -> do_escaping(rest, WrapInDoubleQuotes) // If we reach the end of the string we return the accumulator. "" -> kind - // In all other cases we check if the string starts with the separator, in - // that case we know it must be at least wrapped in double quotes. - // But we keep going in case we find a `"`. - _ -> { - let assert Ok(#(_, rest)) = string.pop_grapheme(string) - case kind == WrapInDoubleQuotes || string.starts_with(string, separator) { - True -> do_escaping(rest, separator, WrapInDoubleQuotes) - False -> do_escaping(rest, separator, kind) - } - } + // In all other cases we keep looking. 
+ _ -> do_escaping(drop_bytes(string, 1), kind) } } From 852d4226efa9dc34e1d0b530f9f04330bbf86dad Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 15:21:22 +0100 Subject: [PATCH 21/23] improve from_lists --- src/gsv.gleam | 93 +++++++-------------------------------------------- 1 file changed, 12 insertions(+), 81 deletions(-) diff --git a/src/gsv.gleam b/src/gsv.gleam index dbecb82..9bf2068 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -392,91 +392,22 @@ pub fn from_lists( line_ending line_ending: LineEnding, ) -> String { let line_ending = line_ending_to_string(line_ending) - do_from_lists(rows, separator, line_ending, []) - |> list.reverse - |> string.join(with: "") -} - -fn do_from_lists( - rows: List(List(String)), - separator: String, - line_ending: String, - acc: List(String), -) -> List(String) { - case rows { - [] -> acc - // When we're down to the last row, we don't add a final newline at the end - // of the string. So we special handle this case and pass in an empty string - // as the `line_ending` to add to the row. - [last_row] -> row_to_string(last_row, separator, "", acc) - // For all other cases we just accumulate the line string onto the string - // accumulator. - [row, ..rest] -> { - let acc = row_to_string(row, separator, line_ending, acc) - do_from_lists(rest, separator, line_ending, acc) - } - } -} - -fn row_to_string( - row: List(String), - separator: String, - line_ending: String, - acc: List(String), -) -> List(String) { - case row { - [] -> acc - - // When we're down to the last field of the row we need to add the line - // ending instead of the field separator. So we special handle this case. - [last_field] -> [line_ending, escape_field(last_field, separator), ..acc] - - // For all other cases we add the field to the accumulator and append a - // separator to separate it from the next field in the row. 
- [field, ..rest] -> { - let acc = [separator, escape_field(field, separator), ..acc] - row_to_string(rest, separator, line_ending, acc) - } - } -} -/// The kind of escaping needed by a csv field. -/// -type Escaping { - NoEscaping - WrapInDoubleQuotes - WrapInDoubleQuotesAndEscapeDoubleQuotes + list.map(rows, fn(row) { + list.map(row, escape_field(_, separator)) + |> string.join(with: separator) + }) + |> string.join(with: line_ending) } fn escape_field(field: String, separator: String) -> String { - case escaping(field, separator) { - NoEscaping -> field - WrapInDoubleQuotes -> "\"" <> field <> "\"" - WrapInDoubleQuotesAndEscapeDoubleQuotes -> - "\"" <> string.replace(in: field, each: "\"", with: "\"\"") <> "\"" - } -} - -fn escaping(string: String, separator: String) -> Escaping { - case string.contains(string, separator) { - True -> do_escaping(string, WrapInDoubleQuotes) - False -> do_escaping(string, NoEscaping) - } -} - -fn do_escaping(string: String, kind: Escaping) { - case string { - // As soon as we find a double quote we know that we must escape the double - // quotes and wrap it in double quotes, no need to keep going through the - // string. - "\"" <> _ -> WrapInDoubleQuotesAndEscapeDoubleQuotes - // If we find a newline we know the string must at least be wrapped in - // double quotes but we keep going in case we find a `"`. - "\n" <> rest -> do_escaping(rest, WrapInDoubleQuotes) - // If we reach the end of the string we return the accumulator. - "" -> kind - // In all other cases we keep looking. 
- _ -> do_escaping(drop_bytes(string, 1), kind) + case string.contains(field, "\"") { + True -> "\"" <> string.replace(in: field, each: "\"", with: "\"\"") <> "\"" + False -> + case string.contains(field, separator) || string.contains(field, "\n") { + True -> "\"" <> field <> "\"" + False -> field + } } } From 28c4fa208d9f01e8781f2f43329cfc2805d72f84 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 15:25:24 +0100 Subject: [PATCH 22/23] document other differences --- src/gsv.gleam | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gsv.gleam b/src/gsv.gleam index 9bf2068..215f9a5 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -73,6 +73,8 @@ fn line_ending_to_string(le: LineEnding) -> String { /// > - both `\n` and `\r\n` line endings are accepted. /// > - a line can start with an empty field `,two,three`. /// > - empty lines are allowed and just ignored. +/// > - lines are not forced to all have the same number of fields. +/// > - a line can end with a comma (meaning its last field is empty). /// pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) { case input { From dd9cfbb5dbf127ce5edd0c8870db6f35642631c9 Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 15:27:52 +0100 Subject: [PATCH 23/23] add test for line with different number of fields --- test/gsv_test.gleam | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam index ea69529..18f463e 100644 --- a/test/gsv_test.gleam +++ b/test/gsv_test.gleam @@ -130,6 +130,14 @@ pub fn escaped_field_with_escaped_double_quotes_test() { |> should.equal([["escaped double quote -> \""]]) } +pub fn rows_with_different_number_of_fields_test() { + "three,fields,woo +only,two" + |> gsv.to_lists + |> should.be_ok + |> should.equal([["three", "fields", "woo"], ["only", "two"]]) +} + // --- DICT PARSING ------------------------------------------------------------ pub fn headers_test() {