From 27e5f9120b83e9cf527be11e4bf8e35848780f7e Mon Sep 17 00:00:00 2001 From: Giacomo Cavalieri Date: Sun, 27 Oct 2024 11:30:33 +0100 Subject: [PATCH] to_dicts --- gleam.toml | 1 + manifest.toml | 2 + src/gsv.gleam | 152 +++++++++++++++++++++++++++++--------------- test/gsv_test.gleam | 50 +++++++++++---- 4 files changed, 140 insertions(+), 65 deletions(-) diff --git a/gleam.toml b/gleam.toml index 5620a5f..95e1cb3 100644 --- a/gleam.toml +++ b/gleam.toml @@ -13,6 +13,7 @@ internal_modules = [ [dependencies] gleam_stdlib = ">= 0.40.0 and < 1.0.0" +glearray = ">= 1.0.0 and < 2.0.0" [dev-dependencies] gleeunit = "~> 1.0" diff --git a/manifest.toml b/manifest.toml index 9231034..bde2f68 100644 --- a/manifest.toml +++ b/manifest.toml @@ -12,6 +12,7 @@ packages = [ { name = "gleam_erlang", version = "0.27.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_erlang", source = "hex", outer_checksum = "DE468F676D71B313C6C8C5334425CFCF827837333F8AB47B64D8A6D7AA40185D" }, { name = "gleam_json", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib", "thoas"], otp_app = "gleam_json", source = "hex", outer_checksum = "9063D14D25406326C0255BDA0021541E797D8A7A12573D849462CAFED459F6EB" }, { name = "gleam_stdlib", version = "0.40.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "86606B75A600BBD05E539EB59FABC6E307EEEA7B1E5865AFB6D980A93BCB2181" }, + { name = "glearray", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glearray", source = "hex", outer_checksum = "B99767A9BC63EF9CC8809F66C7276042E5EFEACAA5B25188B552D3691B91AC6D" }, { name = "gleeunit", version = "1.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "F7A7228925D3EE7D0813C922E062BFD6D7E9310F0BEE585D3A42F3307E3CFD13" }, { name = "glexer", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], 
otp_app = "glexer", source = "hex", outer_checksum = "BD477AD657C2B637FEF75F2405FAEFFA533F277A74EF1A5E17B55B1178C228FB" }, { name = "justin", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "justin", source = "hex", outer_checksum = "7FA0C6DB78640C6DC5FBFD59BF3456009F3F8B485BF6825E97E1EB44E9A1E2CD" }, @@ -24,4 +25,5 @@ packages = [ [requirements] birdie = { version = ">= 1.2.3 and < 2.0.0" } gleam_stdlib = { version = ">= 0.40.0 and < 1.0.0" } +glearray = { version = ">= 1.0.0 and < 2.0.0" } gleeunit = { version = "~> 1.0" } diff --git a/src/gsv.gleam b/src/gsv.gleam index 07b0eec..f2087ab 100644 --- a/src/gsv.gleam +++ b/src/gsv.gleam @@ -2,17 +2,22 @@ import gleam/dict.{type Dict} import gleam/list import gleam/result import gleam/string +import glearray // --- TYPES ------------------------------------------------------------------- pub type ParseError { + /// This error can occur if a csv field contains an unescaped double + /// quote `"`. + /// /// A field can contain a double quote only if it is escaped (that is, - /// surrounded by double quotes). For example `wobb"le` would be an invalid + /// surrounded by double quotes). For example `wibb"le` would be an invalid /// field, the correct way to write such a field would be like this: - /// `"wobb""le"`. + /// `"wibb""le"`. /// UnescapedQuote( /// The byte index of the unescaped double. + /// position: Int, ) @@ -21,21 +26,45 @@ pub type ParseError { /// UnclosedEscapedField( /// The byte index of the start of the unclosed escaped field. + /// start: Int, ) } +/// Possible line endings used when turning a parsed csv back into a string +/// with the `from_lists` and `from_dicts` functions. +/// +pub type LineEnding { + /// The CRLF line ending: `\r\n`. + /// + Windows + + /// The LF line ending: `\n`. 
+ Unix +} + +fn le_to_string(le: LineEnding) -> String { + case le { + Windows -> "\r\n" + Unix -> "\n" + } +} + // --- PARSING ----------------------------------------------------------------- -/// Parses a csv string into a list of lists of strings. +/// Parses a csv string into a list of lists of strings: each line of the csv +/// will be turned into a list with an item for each field. +/// /// ## Examples /// /// ```gleam /// "hello, world -/// goodbye, mars -/// " +/// goodbye, mars" /// |> gsv.to_lists -/// // [["hello", " world"], ["goodbye", " mars"]] +/// // Ok([ +/// // ["hello", " world"], +/// // ["goodbye", " mars"], +/// // ]) /// ``` /// /// > This implementation tries to stick as closely as possible to @@ -62,9 +91,22 @@ pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) { /// This is used to keep track of what the parser is doing. /// type ParseStatus { + /// We're in the middle of parsing an escaped csv field (that is, starting + /// and ending with `"`). + /// ParsingEscapedField + + /// We're in the middle of parsing a regular csv field. + /// ParsingUnescapedField + + /// We've just run into a (non escaped) comma, signalling the end of a field. + /// CommaFound + + /// We've just run into a (non escaped) newline (either a `\n` or `\r\n`), + /// signalling the end of a line and the start of a new one. + /// NewlineFound } @@ -268,60 +310,64 @@ fn extract_field( let field = slice_bytes(string, from, length) case status { CommaFound | ParsingUnescapedField | NewlineFound -> field + // If we were parsing an escaped field then escaped quotes must be replaced + // with a single one. ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"") } } -/// Parses a csv string to a list of dicts. -/// Automatically handles Windows and Unix line endings. -/// Returns a string error msg if the string is not valid csv. 
-/// Unquoted strings are trimmed, while quoted strings have leading and trailing -/// whitespace preserved. -/// Whitespace only or empty strings are not valid headers and will be ignored. -/// Whitespace only or empty strings are not considered "present" in the csv row and -/// are not inserted into the row dict. +/// Parses a csv string into a list of dicts: the first line of the csv is +/// interpreted as the headers' row and each of the following lines is turned +/// into a dict with a value for each of the headers. +/// +/// If a field is empty then it won't be added to the dict. +/// +/// ## Examples +/// +/// ```gleam +/// "pet,name,cuteness +/// dog,Fido,100 +/// cat,,1000 +/// " +/// |> gsv.to_dicts +/// // Ok([ +/// // dict.from_list([ +/// // #("pet", "dog"), #("name", "Fido"), #("cuteness", "100") +/// // ]), +/// // dict.from_list([ +/// // #("pet", "cat"), #("cuteness", "1000") +/// // ]), +/// // ]) +/// ``` +/// +/// > Just like `to_lists`, this implementation tries to stick as closely as +/// > possible to [RFC4180](https://www.ietf.org/rfc/rfc4180.txt). +/// > You can look at `to_lists`' documentation to see how it differs from the +/// > RFC. 
+/// pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError) { - use lol <- result.try(to_lists(input)) - case lol { + use rows <- result.map(to_lists(input)) + case rows { [] -> [] [headers, ..rows] -> { - let headers = - list.index_fold(headers, dict.new(), fn(acc, x, i) { - case string.trim(x) == "" { - True -> acc - False -> dict.insert(acc, i, x) - } - }) + let headers = glearray.from_list(headers) - list.map(rows, fn(row) { - use acc, x, i <- list.index_fold(row, dict.new()) - case dict.get(headers, i) { - Error(Nil) -> acc - Ok(h) -> - case string.trim(x) { - "" -> acc - t -> dict.insert(acc, string.trim(h), t) - } - } - }) + use row <- list.map(rows) + use row, field, index <- list.index_fold(row, dict.new()) + case field { + // If the field is empty then we don't add it to the row's dict. + "" -> row + _ -> + // We look for the header corresponding to this field's position. + case glearray.get(headers, index) { + Ok(header) -> dict.insert(row, header, field) + // This could happen if the row has more fields than headers in the + // header row, in this case the field is just discarded + Error(_) -> row + } + } } } - |> Ok -} - -/// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows" -/// line endings. Use with the `from_lists` function when -/// writing to a csv string. -pub type LineEnding { - Windows - Unix -} - -fn le_to_string(le: LineEnding) -> String { - case le { - Windows -> "\r\n" - Unix -> "\n" - } } /// Takes a list of lists of strings and writes it to a csv string. 
@@ -329,6 +375,7 @@ fn le_to_string(le: LineEnding) -> String { /// line endings with double quotes (in csv, double quotes get escaped by doing /// a double doublequote) /// The string `he"llo\n` becomes `"he""llo\n"` +/// pub fn from_lists( input: List(List(String)), separator separator: String, @@ -360,6 +407,7 @@ pub fn from_lists( /// line endings with double quotes (in csv, double quotes get escaped by doing /// a double doublequote) /// The string `he"llo\n` becomes `"he""llo\n"` +/// pub fn from_dicts( input: List(Dict(String, String)), separator separator: String, @@ -400,9 +448,9 @@ pub fn from_dicts( /// yield valid utf8 slices. /// @external(erlang, "gsv_ffi", "slice") -@external(javascript, "../gsv_ffi.mjs", "slice") +@external(javascript, "./gsv_ffi.mjs", "slice") fn slice_bytes(string: String, from: Int, length: Int) -> String @external(erlang, "gsv_ffi", "drop_bytes") -@external(javascript, "../gsv_ffi.mjs", "drop_bytes") +@external(javascript, "./gsv_ffi.mjs", "drop_bytes") fn drop_bytes(string: String, bytes: Int) -> String diff --git a/test/gsv_test.gleam b/test/gsv_test.gleam index 25c4a88..a249099 100644 --- a/test/gsv_test.gleam +++ b/test/gsv_test.gleam @@ -133,7 +133,9 @@ pub fn escaped_field_with_escaped_double_quotes_test() { // --- DICT PARSING ------------------------------------------------------------ pub fn headers_test() { - "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n" + "name,age +Ben,27,TRUE,Hello +Austin,27," |> gsv.to_dicts |> should.be_ok |> should.equal([ @@ -143,20 +145,30 @@ pub fn headers_test() { } pub fn dicts_with_empty_str_header_test() { - "name,\" \", ,,age\nBen,foo,bar,baz,27,extra_data" + "name,\" \", ,,age +Ben,wibble,wobble,woo,27,extra_data" |> gsv.to_dicts |> should.be_ok - |> gsv.from_dicts(",", Unix) - |> should.equal("age,name\n27,Ben") + |> should.equal([ + dict.from_list([ + #("name", "Ben"), + #(" ", "wibble"), + #(" ", "wobble"), + #("", "woo"), + #("age", "27"), + ]), + ]) } pub fn 
dicts_with_empty_values_test() { - "name, age\nBen,,,,\nAustin, 27" + "name,age +Ben,,,, +Austin,27" |> gsv.to_dicts |> should.be_ok |> should.equal([ dict.from_list([#("name", "Ben")]), - dict.from_list([#("age", "27"), #("name", "Austin")]), + dict.from_list([#("name", "Austin"), #("age", "27")]), ]) } @@ -221,21 +233,33 @@ Austin, 25, FALSE" } pub fn encode_with_escaped_string_windows_test() { - let assert Ok(lls) = - "Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE" + let assert Ok(rows) = + "Ben, 25,' TRUE\n\r'' ' +Austin, 25, FALSE" + |> string.replace(each: "'", with: "\"") |> gsv.to_lists - lls + rows |> gsv.from_lists(separator: ",", line_ending: Windows) - |> should.equal("Ben,25,\" TRUE\n\r\"\" \"\r\nAustin,25,FALSE") + |> string.replace(each: "\"", with: "'") + |> should.equal( + "Ben, 25,' TRUE\n\r'' '\r +Austin, 25, FALSE", + ) } pub fn dicts_round_trip_test() { - "name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n" + "name,age +Ben,27,TRUE,Hello +Austin,27," |> gsv.to_dicts |> should.be_ok |> gsv.from_dicts(",", Unix) - |> should.equal("age,name\n27,Ben\n27,Austin") + |> should.equal( + "age,name +27,Ben +27,Austin", + ) } // --- TEST HELPERS ------------------------------------------------------------ @@ -312,5 +336,5 @@ fn do_position_to_line_and_column( } @external(erlang, "gsv_ffi", "drop_bytes") -@external(javascript, "../src/gsv_ffi.mjs", "drop_bytes") +@external(javascript, "./gsv_ffi.mjs", "drop_bytes") fn drop_bytes(string: String, bytes: Int) -> String