Skip to content

Commit

Permalink
to_dicts
Browse files Browse the repository at this point in the history
  • Loading branch information
giacomocavalieri committed Oct 27, 2024
1 parent 2dcfe61 commit 27e5f91
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 65 deletions.
1 change: 1 addition & 0 deletions gleam.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ internal_modules = [

[dependencies]
gleam_stdlib = ">= 0.40.0 and < 1.0.0"
glearray = ">= 1.0.0 and < 2.0.0"

[dev-dependencies]
gleeunit = "~> 1.0"
Expand Down
2 changes: 2 additions & 0 deletions manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ packages = [
{ name = "gleam_erlang", version = "0.27.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_erlang", source = "hex", outer_checksum = "DE468F676D71B313C6C8C5334425CFCF827837333F8AB47B64D8A6D7AA40185D" },
{ name = "gleam_json", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib", "thoas"], otp_app = "gleam_json", source = "hex", outer_checksum = "9063D14D25406326C0255BDA0021541E797D8A7A12573D849462CAFED459F6EB" },
{ name = "gleam_stdlib", version = "0.40.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "86606B75A600BBD05E539EB59FABC6E307EEEA7B1E5865AFB6D980A93BCB2181" },
{ name = "glearray", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glearray", source = "hex", outer_checksum = "B99767A9BC63EF9CC8809F66C7276042E5EFEACAA5B25188B552D3691B91AC6D" },
{ name = "gleeunit", version = "1.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "F7A7228925D3EE7D0813C922E062BFD6D7E9310F0BEE585D3A42F3307E3CFD13" },
{ name = "glexer", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "glexer", source = "hex", outer_checksum = "BD477AD657C2B637FEF75F2405FAEFFA533F277A74EF1A5E17B55B1178C228FB" },
{ name = "justin", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "justin", source = "hex", outer_checksum = "7FA0C6DB78640C6DC5FBFD59BF3456009F3F8B485BF6825E97E1EB44E9A1E2CD" },
Expand All @@ -24,4 +25,5 @@ packages = [
[requirements]
birdie = { version = ">= 1.2.3 and < 2.0.0" }
gleam_stdlib = { version = ">= 0.40.0 and < 1.0.0" }
glearray = { version = ">= 1.0.0 and < 2.0.0" }
gleeunit = { version = "~> 1.0" }
152 changes: 100 additions & 52 deletions src/gsv.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,22 @@ import gleam/dict.{type Dict}
import gleam/list
import gleam/result
import gleam/string
import glearray

// --- TYPES -------------------------------------------------------------------

pub type ParseError {
/// This error can occur if a csv field contains an unescaped double
/// quote `"`.
///
/// A field can contain a double quote only if it is escaped (that is,
/// surrounded by double quotes). For example `wobb"le` would be an invalid
/// surrounded by double quotes). For example `wibb"le` would be an invalid
/// field, the correct way to write such a field would be like this:
/// `"wobb""le"`.
/// `"wibb""le"`.
///
UnescapedQuote(
/// The byte index of the unescaped double quote.
///
position: Int,
)

Expand All @@ -21,21 +26,45 @@ pub type ParseError {
///
UnclosedEscapedField(
/// The byte index of the start of the unclosed escaped field.
///
start: Int,
)
}

/// Possible line endings used when turning a parsed csv back into a string
/// with the `from_lists` and `from_dicts` functions.
///
pub type LineEnding {
/// The CRLF line ending: `\r\n`.
///
Windows

/// The LF line ending: `\n`.
///
Unix
}

/// Turns a `LineEnding` into the literal string it stands for: `\n` for
/// `Unix` and `\r\n` for `Windows`.
///
fn le_to_string(line_ending: LineEnding) -> String {
  case line_ending {
    Unix -> "\n"
    Windows -> "\r\n"
  }
}

// --- PARSING -----------------------------------------------------------------

/// Parses a csv string into a list of lists of strings.
/// Parses a csv string into a list of lists of strings: each line of the csv
/// will be turned into a list with an item for each field.
///
/// ## Examples
///
/// ```gleam
/// "hello, world
/// goodbye, mars
/// "
/// goodbye, mars"
/// |> gsv.to_lists
/// // [["hello", " world"], ["goodbye", " mars"]]
/// // Ok([
/// // ["hello", " world"],
/// // ["goodbye", " mars"],
/// // ])
/// ```
///
/// > This implementation tries to stick as closely as possible to
Expand All @@ -62,9 +91,22 @@ pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) {
/// This is used to keep track of what the parser is doing.
///
type ParseStatus {
/// We're in the middle of parsing an escaped csv field (that is, starting
/// and ending with `"`).
///
ParsingEscapedField

/// We're in the middle of parsing a regular csv field.
///
ParsingUnescapedField

/// We've just run into a (non escaped) comma, signalling the end of a field.
///
CommaFound

/// We've just run into a (non escaped) newline (either a `\n` or `\r\n`),
/// signalling the end of a line and the start of a new one.
///
NewlineFound
}

Expand Down Expand Up @@ -268,67 +310,72 @@ fn extract_field(
let field = slice_bytes(string, from, length)
case status {
CommaFound | ParsingUnescapedField | NewlineFound -> field
// If we were parsing an escaped field then escaped quotes must be replaced
// with a single one.
ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"")
}
}

/// Parses a csv string to a list of dicts.
/// Automatically handles Windows and Unix line endings.
/// Returns a string error msg if the string is not valid csv.
/// Unquoted strings are trimmed, while quoted strings have leading and trailing
/// whitespace preserved.
/// Whitespace only or empty strings are not valid headers and will be ignored.
/// Whitespace only or empty strings are not considered "present" in the csv row and
/// are not inserted into the row dict.
/// Parses a csv string into a list of dicts: the first line of the csv is
/// interpreted as the headers' row and each of the following lines is turned
/// into a dict with a value for each of the headers.
///
/// If a field is empty then it won't be added to the dict.
///
/// ## Examples
///
/// ```gleam
/// "pet,name,cuteness
/// dog,Fido,100
/// cat,,1000
/// "
/// |> gsv.to_dicts
/// // Ok([
/// // dict.from_list([
/// // #("pet", "dog"), #("name", "Fido"), #("cuteness", "100")
/// // ]),
/// // dict.from_list([
/// // #("pet", "cat"), #("cuteness", "1000")
/// // ]),
/// // ])
/// ```
///
/// > Just like `to_lists`, this implementation tries to stick as closely as
/// > possible to [RFC4180](https://www.ietf.org/rfc/rfc4180.txt).
/// > You can look at `to_lists`' documentation to see how it differs from the
/// > RFC.
///
pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError) {
use lol <- result.try(to_lists(input))
case lol {
use rows <- result.map(to_lists(input))
case rows {
[] -> []
[headers, ..rows] -> {
let headers =
list.index_fold(headers, dict.new(), fn(acc, x, i) {
case string.trim(x) == "" {
True -> acc
False -> dict.insert(acc, i, x)
}
})
let headers = glearray.from_list(headers)

list.map(rows, fn(row) {
use acc, x, i <- list.index_fold(row, dict.new())
case dict.get(headers, i) {
Error(Nil) -> acc
Ok(h) ->
case string.trim(x) {
"" -> acc
t -> dict.insert(acc, string.trim(h), t)
}
}
})
use row <- list.map(rows)
use row, field, index <- list.index_fold(row, dict.new())
case field {
// If the field is empty then we don't add it to the row's dict.
"" -> row
_ ->
// We look for the header corresponding to this field's position.
case glearray.get(headers, index) {
Ok(header) -> dict.insert(row, header, field)
// This could happen if the row has more fields than headers in the
// header row, in this case the field is just discarded
Error(_) -> row
}
}
}
}
|> Ok
}

/// Option for using "\n = LF = Unix" or "\r\n = CRLF = Windows"
/// line endings. Use with the `from_lists` function when
/// writing to a csv string.
pub type LineEnding {
Windows
Unix
}

fn le_to_string(le: LineEnding) -> String {
case le {
Windows -> "\r\n"
Unix -> "\n"
}
}

/// Takes a list of lists of strings and writes it to a csv string.
/// Strings that contain double quotes or line endings are automatically
/// escaped by wrapping them in double quotes (in csv, a double quote inside an
/// escaped field is itself escaped by doubling it).
/// For example, the string `he"llo\n` becomes `"he""llo\n"`.
///
pub fn from_lists(
input: List(List(String)),
separator separator: String,
Expand Down Expand Up @@ -360,6 +407,7 @@ pub fn from_lists(
/// line endings with double quotes (in csv, double quotes get escaped by doing
/// a double doublequote)
/// The string `he"llo\n` becomes `"he""llo\n"`
///
pub fn from_dicts(
input: List(Dict(String, String)),
separator separator: String,
Expand Down Expand Up @@ -400,9 +448,9 @@ pub fn from_dicts(
/// yield valid utf8 slices.
///
@external(erlang, "gsv_ffi", "slice")
@external(javascript, "../gsv_ffi.mjs", "slice")
@external(javascript, "./gsv_ffi.mjs", "slice")
fn slice_bytes(string: String, from: Int, length: Int) -> String

@external(erlang, "gsv_ffi", "drop_bytes")
@external(javascript, "../gsv_ffi.mjs", "drop_bytes")
@external(javascript, "./gsv_ffi.mjs", "drop_bytes")
fn drop_bytes(string: String, bytes: Int) -> String
50 changes: 37 additions & 13 deletions test/gsv_test.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,9 @@ pub fn escaped_field_with_escaped_double_quotes_test() {
// --- DICT PARSING ------------------------------------------------------------

pub fn headers_test() {
"name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n"
"name,age
Ben,27,TRUE,Hello
Austin,27,"
|> gsv.to_dicts
|> should.be_ok
|> should.equal([
Expand All @@ -143,20 +145,30 @@ pub fn headers_test() {
}

pub fn dicts_with_empty_str_header_test() {
"name,\" \", ,,age\nBen,foo,bar,baz,27,extra_data"
"name,\" \", ,,age
Ben,wibble,wobble,woo,27,extra_data"
|> gsv.to_dicts
|> should.be_ok
|> gsv.from_dicts(",", Unix)
|> should.equal("age,name\n27,Ben")
|> should.equal([
dict.from_list([
#("name", "Ben"),
#(" ", "wibble"),
#(" ", "wobble"),
#("", "woo"),
#("age", "27"),
]),
])
}

pub fn dicts_with_empty_values_test() {
"name, age\nBen,,,,\nAustin, 27"
"name,age
Ben,,,,
Austin,27"
|> gsv.to_dicts
|> should.be_ok
|> should.equal([
dict.from_list([#("name", "Ben")]),
dict.from_list([#("age", "27"), #("name", "Austin")]),
dict.from_list([#("name", "Austin"), #("age", "27")]),
])
}

Expand Down Expand Up @@ -221,21 +233,33 @@ Austin, 25, FALSE"
}

pub fn encode_with_escaped_string_windows_test() {
let assert Ok(lls) =
"Ben, 25,\" TRUE\n\r\"\" \"\nAustin, 25, FALSE"
let assert Ok(rows) =
"Ben, 25,' TRUE\n\r'' '
Austin, 25, FALSE"
|> string.replace(each: "'", with: "\"")
|> gsv.to_lists

lls
rows
|> gsv.from_lists(separator: ",", line_ending: Windows)
|> should.equal("Ben,25,\" TRUE\n\r\"\" \"\r\nAustin,25,FALSE")
|> string.replace(each: "\"", with: "'")
|> should.equal(
"Ben, 25,' TRUE\n\r'' '\r
Austin, 25, FALSE",
)
}

pub fn dicts_round_trip_test() {
"name, age\nBen, 27, TRUE, Hello\nAustin, 27,\n"
"name,age
Ben,27,TRUE,Hello
Austin,27,"
|> gsv.to_dicts
|> should.be_ok
|> gsv.from_dicts(",", Unix)
|> should.equal("age,name\n27,Ben\n27,Austin")
|> should.equal(
"age,name
27,Ben
27,Austin",
)
}

// --- TEST HELPERS ------------------------------------------------------------
Expand Down Expand Up @@ -312,5 +336,5 @@ fn do_position_to_line_and_column(
}

@external(erlang, "gsv_ffi", "drop_bytes")
@external(javascript, "../src/gsv_ffi.mjs", "drop_bytes")
@external(javascript, "./gsv_ffi.mjs", "drop_bytes")
fn drop_bytes(string: String, bytes: Int) -> String

0 comments on commit 27e5f91

Please sign in to comment.