Skip to content

Commit

Permalink
single files good
Browse files Browse the repository at this point in the history
  • Loading branch information
giacomocavalieri committed Oct 19, 2024
1 parent 442020e commit 545b7e2
Show file tree
Hide file tree
Showing 2 changed files with 267 additions and 270 deletions.
276 changes: 267 additions & 9 deletions src/gsv.gleam
Original file line number Diff line number Diff line change
@@ -1,9 +1,31 @@
import gleam/dict.{type Dict}
import gleam/io
import gleam/list
import gleam/result
import gleam/string
import gsv/internal/parse

// --- TYPES -------------------------------------------------------------------

pub type ParseError {
/// A field can contain a double quote only if it is escaped (that is,
/// surrounded by double quotes). For example `wobb"le` would be an invalid
/// field, the correct way to write such a field would be like this:
/// `"wobb""le"`.
///
UnescapedQuote(
/// The byte index of the unescaped double.
position: Int,
)

/// This error can occur when the file ends without the closing `"` of an
/// escaped field. For example: `"hello`.
///
UnclosedEscapedField(
/// The byte index of the start of the unclosed escaped field.
start: Int,
)
}

// --- PARSING -----------------------------------------------------------------

/// Parses a csv string into a list of lists of strings.
/// ## Examples
Expand All @@ -23,12 +45,232 @@ import gsv/internal/parse
/// > - a line can start with an empty field `,two,three`.
/// > - empty lines are allowed and just ignored.
///
pub fn to_lists(input: String) -> Result(List(List(String)), String) {
parse.parse(input)
|> result.map_error(fn(error) {
io.debug(error)
todo as "decide what to do with errors"
})
pub fn to_lists(input: String) -> Result(List(List(String)), ParseError) {
case input {
// We just ignore all unescaped newlines at the beginning of a file.
"\n" <> rest | "\r\n" <> rest -> to_lists(rest)
// If it starts with a `"` then we know it starts with an escaped field.
"\"" <> rest -> do_parse(rest, input, 1, 0, [], [], ParsingEscapedField)
// If it starts with a `,` then it starts with an empty field we're filling
// out manually.
"," <> rest -> do_parse(rest, input, 1, 0, [""], [], CommaFound)
// Otherwise we just start parsing the first unescaped field.
_ -> do_parse(input, input, 0, 0, [], [], ParsingUnescapedField)
}
|> result.map_error(fn(_) { todo as "decide what to do with errors" })
}

/// This is used to keep track of what the parser is doing.
///
type ParseStatus {
ParsingEscapedField
ParsingUnescapedField
CommaFound
NewlineFound
}

/// ## What does this scary looking function do?
///
/// At a high level, it goes over the csv `string` byte-by-byte and parses rows
/// accumulating those into `rows` as it goes.
///
///
/// ## Why does it have all these parameters? What does each one do?
///
/// In order to be extra efficient this function parses the csv file in a single
/// pass and uses string slicing to avoid copying data.
/// Each time we see a new field we keep track of the byte where it starts with
/// `field_start` and then count the bytes (that's the `field_length` variable)
/// until we fiend its end (either a newline, the end of the file, or a `,`).
///
/// After reaching the end of a field we extract it from the original string
/// taking a slice that goes from `field_start` and has `field_length` bytes.
/// This is where the magic happens: slicing a string this way is a constant
/// time operation and doesn't copy the string so it's crazy fast!
///
/// `row` is an accumulator with all the fields of the current row as
/// they are parsed. Once we run into a newline `current_row` is added to all
/// the other `rows`.
///
/// We also keep track of _what_ we're parsing with the `status` to make
/// sure that we're correctly dealing with escaped fields and double quotes.
///
fn do_parse(
string: String,
original: String,
field_start: Int,
field_length: Int,
row: List(String),
rows: List(List(String)),
status: ParseStatus,
) -> Result(List(List(String)), ParseError) {
case string, status {
// If we find a comma we're done with the current field and can take a slice
// going from `field_start` with `field_length` bytes:
//
// wibble,wobble,...
// ╰────╯ field_length = 6
// ┬
// ╰ field_start
//
// After taking the slice we move the slice start _after_ the comma:
//
// wibble,wobble,...
// ┬
// ╰ field_start = field_start + field_length + 1 (the comma)
//
"," <> rest, CommaFound
| "," <> rest, NewlineFound
| "," <> rest, ParsingUnescapedField
-> {
let field = extract_field(original, field_start, field_length, status)
let row = [field, ..row]
let field_start = field_start + field_length + 1
do_parse(rest, original, field_start, 0, row, rows, CommaFound)
}
"\"," <> rest, ParsingEscapedField -> {
let field = extract_field(original, field_start, field_length, status)
let row = [field, ..row]
let field_start = field_start + field_length + 2
do_parse(rest, original, field_start, 0, row, rows, CommaFound)
}

// When the string is over we're done parsing.
// We take the final field we were in the middle of parsing and add it to
// the current row that is returned together with all the parsed rows.
//
"", ParsingUnescapedField | "\"", ParsingEscapedField -> {
let field = extract_field(original, field_start, field_length, status)
let row = list.reverse([field, ..row])
Ok(list.reverse([row, ..rows]))
}

"", CommaFound -> {
let row = list.reverse(["", ..row])
Ok(list.reverse([row, ..rows]))
}

"", NewlineFound -> Ok(list.reverse(rows))

// If the string is over and we were parsing an escaped field, that's an
// error. We would expect to find a closing double quote before the end of
// the data.
//
"", ParsingEscapedField -> Error(UnclosedEscapedField(field_start))

// When we run into a new line (CRLF or just LF) we know we're done with the
// current field and take a slice of it, just like we did in the previous
// branch!
// The only difference is we also add the current `row` to all the other
// ones and start with a new one.
//
// > ⚠️ As for RFC 4180 lines should only be delimited by a CRLF.
// > Here we do something slightly different and also accept lines that are
// > delimited by just LF too.
//
// The next three branches are the same except for the new `field_start`
// that has to take into account the different lengths.
// I tried writing it as `"\n" as sep | "\r\n" as sep | ...` and then taking
// adding the lenght of that but it had a noticeable (albeit small) impact
// on performance.
//
"\n" <> rest, ParsingUnescapedField -> {
let field = extract_field(original, field_start, field_length, status)
let row = list.reverse([field, ..row])
let rows = [row, ..rows]
let field_start = field_start + field_length + 1
do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
}
"\r\n" <> rest, ParsingUnescapedField | "\"\n" <> rest, ParsingEscapedField -> {
let field = extract_field(original, field_start, field_length, status)
let row = list.reverse([field, ..row])
let rows = [row, ..rows]
let field_start = field_start + field_length + 2
do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
}
"\"\r\n" <> rest, ParsingEscapedField -> {
let field = extract_field(original, field_start, field_length, status)
let row = list.reverse([field, ..row])
let rows = [row, ..rows]
let field_start = field_start + field_length + 3
do_parse(rest, original, field_start, 0, [], rows, NewlineFound)
}

// If the newlines is immediately after a comma then the row ends with an
// empty field.
//
"\n" <> rest, CommaFound -> {
let row = list.reverse(["", ..row])
let rows = [row, ..rows]
do_parse(rest, original, field_start + 1, 0, [], rows, NewlineFound)
}
"\r\n" <> rest, CommaFound -> {
let row = list.reverse(["", ..row])
let rows = [row, ..rows]
do_parse(rest, original, field_start + 2, 0, [], rows, NewlineFound)
}

// If the newline immediately comes after a newline that means we've run
// into an empty line that we can just safely ignore.
//
"\n" <> rest, NewlineFound ->
do_parse(rest, original, field_start + 1, 0, row, rows, status)
"\r\n" <> rest, NewlineFound ->
do_parse(rest, original, field_start + 2, 0, row, rows, status)

// An escaped quote found while parsing an escaped field.
//
"\"\"" <> rest, ParsingEscapedField ->
do_parse(rest, original, field_start, field_length + 2, row, rows, status)

// An unescaped quote found while parsing a field.
//
"\"" <> _, ParsingUnescapedField | "\"" <> _, ParsingEscapedField ->
Error(UnescapedQuote(position: field_start + field_length))

// If the quote is found immediately after a comma or a newline that signals
// the start of a new escaped field to parse.
//
"\"" <> rest, CommaFound | "\"" <> rest, NewlineFound -> {
let status = ParsingEscapedField
do_parse(rest, original, field_start + 1, 0, row, rows, status)
}

// In all other cases we're still parsing a field so we just drop a byte
// from the string we're iterating through, increase the size of the slice
// we need to take and keep going.
//
// > ⚠️ Notice how we're not trying to trim any whitespaces at the
// > beginning or end of a field: RFC 4810 states that "Spaces are
// > considered part of a field and should not be ignored."
//
_, CommaFound
| _, NewlineFound
| _, ParsingUnescapedField
| _, ParsingEscapedField
-> {
let status = case status {
ParsingEscapedField -> ParsingEscapedField
CommaFound | NewlineFound | ParsingUnescapedField ->
ParsingUnescapedField
}
let rest = drop_bytes(string, 1)
do_parse(rest, original, field_start, field_length + 1, row, rows, status)
}
}
}

fn extract_field(
string: String,
from: Int,
length: Int,
status: ParseStatus,
) -> String {
let field = slice_bytes(string, from, length)
case status {
CommaFound | ParsingUnescapedField | NewlineFound -> field
ParsingEscapedField -> string.replace(in: field, each: "\"\"", with: "\"")
}
}

/// Parses a csv string to a list of dicts.
Expand All @@ -39,7 +281,7 @@ pub fn to_lists(input: String) -> Result(List(List(String)), String) {
/// Whitespace only or empty strings are not valid headers and will be ignored.
/// Whitespace only or empty strings are not considered "present" in the csv row and
/// are not inserted into the row dict.
pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), String) {
pub fn to_dicts(input: String) -> Result(List(Dict(String, String)), ParseError) {
use lol <- result.try(to_lists(input))
case lol {
[] -> []
Expand Down Expand Up @@ -149,3 +391,19 @@ pub fn from_dicts(
}
}
}

// --- FFI HELPERS -------------------------------------------------------------

/// In general this wouldn't be safe, by just slicing random bytes in the middle
/// of a utf8 string we might end up with something that is not a valid utf8
/// string.
/// However, the parser only slices fields in between commas so it should always
/// yield valid utf8 slices.
///
@external(erlang, "gsv_ffi", "slice")
@external(javascript, "../gsv_ffi.mjs", "slice")
fn slice_bytes(string: String, from: Int, length: Int) -> String

@external(erlang, "gsv_ffi", "drop_bytes")
@external(javascript, "../gsv_ffi.mjs", "drop_bytes")
fn drop_bytes(string: String, bytes: Int) -> String
Loading

0 comments on commit 545b7e2

Please sign in to comment.