Skip to content

Commit

Permalink
Added patch for whitespace trimming (#94)
Browse files Browse the repository at this point in the history
* Fix issue with whitespace parsing

* Update Catch + added more test cases

* Update test_read_csv.cpp

* Update CMake settings

* Update version + single header files

* Update README.md
  • Loading branch information
vincentlaucsb committed May 4, 2020
1 parent 351b646 commit 4ccef57
Show file tree
Hide file tree
Showing 9 changed files with 4,111 additions and 1,240 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ add_subdirectory("programs")

## Developer settings
if (CSV_DEVELOPER)
# Allow for performance profiling
if (MSVC)
target_link_options(csv PUBLIC /PROFILE)
endif()

# Generate a single header library
find_package(Python3 QUIET)
if(Python3_Interpreter_FOUND)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ do not hesitate to report it.

## Documentation

In addition to the [Features & Examples](#features--examples) below, a [fully-fledged online documentation](http://vincela.com/csv) contains more examples, details, interesting features, and instructions for less common use cases.
In addition to the [Features & Examples](#features--examples) below, the [fully-fledged online documentation](https://vincentlaucsb.github.io/csv-parser/html/) contains more examples, details, interesting features, and instructions for less common use cases.

## Integration

Expand Down
2 changes: 1 addition & 1 deletion include/csv.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
CSV for C++, version 1.3.0.1
CSV for C++, version 1.3.1
https://github.com/vincentlaucsb/csv-parser
MIT License
Expand Down
33 changes: 11 additions & 22 deletions include/internal/csv_reader_internals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ namespace csv {
text_buffer.reserve(data.in.size());
split_buffer.reserve(data.in.size() / 10);

const size_t in_size = in.size();
for (size_t i = 0; i < in_size; i++) {
for (size_t i = 0; i < in.size(); i++) {
switch (parse_flags[data.in[i] + 128]) {
case ParseFlags::DELIMITER:
if (!data.quote_escape) {
Expand All @@ -28,7 +27,7 @@ namespace csv {
case ParseFlags::NEWLINE:
if (!data.quote_escape) {
// End of record -> Write record
if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF)
if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF)
++i;

data.records.push_back(CSVRow(data.row_buffer));
Expand All @@ -41,25 +40,15 @@ namespace csv {
case ParseFlags::NOT_SPECIAL: {
size_t start, end;

// Trim off leading whitespace
while (i < in_size && ws_flags[in[i] + 128]) {
i++;
}

start = i;

// Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
// sequences, use the loop below to avoid having to go through the outer
// switch statement as much as possible
while (i + 1 < in_size
&& parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
i++;
}

// Trim off trailing whitespace
end = i;
while (ws_flags[in[end] + 128]) {
end--;
if (!parse_not_special(
in,
parse_flags,
ws_flags,
i,
start,
end
)) {
break;
}

// Finally append text
Expand Down
43 changes: 43 additions & 0 deletions include/internal/csv_reader_internals.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,49 @@ namespace csv {
return ret;
}

/** Parse a CSV field until a delimiter is hit
 *
 *  Skips leading whitespace, consumes the contiguous run of NOT_SPECIAL
 *  characters that follows, and then trims trailing whitespace off that run.
 *
 *  @param[in]     in           Text being parsed
 *  @param[in]     parse_flags  Lookup table indexed by `character + 128`
 *  @param[in]     ws_flags     Lookup table indexed by `character + 128`;
 *                              `true` entries are trimmed as whitespace
 *  @param[in,out] i            On entry, the index at which the field begins.
 *                              When `true` is returned, the index of the last
 *                              character of the NOT_SPECIAL run. When `false`
 *                              is returned, the index just before the position
 *                              at which scanning stopped, so that a caller
 *                              which increments `i` resumes at that position.
 *  @param[out]    start        Index of the first non-whitespace character
 *  @param[out]    end          Index of the last non-whitespace character
 *                              (only written when `true` is returned)
 *
 *  @return `true` if `[start, end]` delimits text that should be saved to the
 *          text buffer, `false` if the field consisted solely of whitespace
 */
CONSTEXPR bool parse_not_special(
    csv::string_view in,
    const csv::internals::ParseFlags* const parse_flags,
    const bool* const ws_flags,
    size_t& i,
    size_t& start,
    size_t& end) {
    // Trim off leading whitespace
    while (i < in.size() && ws_flags[in[i] + 128]) {
        i++;
    }

    start = i;

    // Case: Whitespace ran all the way to the end of the input.
    // There is no character at in[start] to classify, so bail out here
    // rather than indexing one past the end of the view.
    if (start >= in.size()) {
        if (i > 0) {
            i--;
        }
        return false;
    }

    // Case: This field is entirely whitespace
    if (parse_flags[in[start] + 128] >= ParseFlags::DELIMITER) {
        // Back the parser up one character so switch statement
        // can process the delimiter or newline
        i--;
        return false;
    }

    // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
    // sequences, use the loop below to avoid having to go through the outer
    // switch statement as much as possible
    while (i + 1 < in.size()
        && parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
        i++;
    }

    // Trim off trailing whitespace. in[start] is known not to be whitespace
    // at this point, so the scan never needs to go below start; the explicit
    // bound keeps `end` from wrapping around if that invariant is ever broken.
    end = i;
    while (end > start && ws_flags[in[end] + 128]) {
        end--;
    }

    return true;
}

struct ParseData {
csv::string_view in;
ParseFlagMap parse_flags;
Expand Down
80 changes: 56 additions & 24 deletions single_include/csv.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
/*
CSV for C++, version 1.3.0.1
CSV for C++, version 1.3.1
https://github.com/vincentlaucsb/csv-parser
MIT License
Expand Down Expand Up @@ -4176,6 +4176,49 @@ namespace csv {
return ret;
}

/** Parse a CSV field until a delimiter is hit
 *
 *  Skips leading whitespace, consumes the contiguous run of NOT_SPECIAL
 *  characters that follows, and then trims trailing whitespace off that run.
 *
 *  @param[in]     in           Text being parsed
 *  @param[in]     parse_flags  Lookup table indexed by `character + 128`
 *  @param[in]     ws_flags     Lookup table indexed by `character + 128`;
 *                              `true` entries are trimmed as whitespace
 *  @param[in,out] i            On entry, the index at which the field begins.
 *                              When `true` is returned, the index of the last
 *                              character of the NOT_SPECIAL run. When `false`
 *                              is returned, the index just before the position
 *                              at which scanning stopped, so that a caller
 *                              which increments `i` resumes at that position.
 *  @param[out]    start        Index of the first non-whitespace character
 *  @param[out]    end          Index of the last non-whitespace character
 *                              (only written when `true` is returned)
 *
 *  @return `true` if `[start, end]` delimits text that should be saved to the
 *          text buffer, `false` if the field consisted solely of whitespace
 */
CONSTEXPR bool parse_not_special(
    csv::string_view in,
    const csv::internals::ParseFlags* const parse_flags,
    const bool* const ws_flags,
    size_t& i,
    size_t& start,
    size_t& end) {
    // Trim off leading whitespace
    while (i < in.size() && ws_flags[in[i] + 128]) {
        i++;
    }

    start = i;

    // Case: Whitespace ran all the way to the end of the input.
    // There is no character at in[start] to classify, so bail out here
    // rather than indexing one past the end of the view.
    if (start >= in.size()) {
        if (i > 0) {
            i--;
        }
        return false;
    }

    // Case: This field is entirely whitespace
    if (parse_flags[in[start] + 128] >= ParseFlags::DELIMITER) {
        // Back the parser up one character so switch statement
        // can process the delimiter or newline
        i--;
        return false;
    }

    // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
    // sequences, use the loop below to avoid having to go through the outer
    // switch statement as much as possible
    while (i + 1 < in.size()
        && parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
        i++;
    }

    // Trim off trailing whitespace. in[start] is known not to be whitespace
    // at this point, so the scan never needs to go below start; the explicit
    // bound keeps `end` from wrapping around if that invariant is ever broken.
    end = i;
    while (end > start && ws_flags[in[end] + 128]) {
        end--;
    }

    return true;
}

struct ParseData {
csv::string_view in;
ParseFlagMap parse_flags;
Expand Down Expand Up @@ -5003,8 +5046,7 @@ namespace csv {
text_buffer.reserve(data.in.size());
split_buffer.reserve(data.in.size() / 10);

const size_t in_size = in.size();
for (size_t i = 0; i < in_size; i++) {
for (size_t i = 0; i < in.size(); i++) {
switch (parse_flags[data.in[i] + 128]) {
case ParseFlags::DELIMITER:
if (!data.quote_escape) {
Expand All @@ -5016,7 +5058,7 @@ namespace csv {
case ParseFlags::NEWLINE:
if (!data.quote_escape) {
// End of record -> Write record
if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF)
if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF)
++i;

data.records.push_back(CSVRow(data.row_buffer));
Expand All @@ -5029,25 +5071,15 @@ namespace csv {
case ParseFlags::NOT_SPECIAL: {
size_t start, end;

// Trim off leading whitespace
while (i < in_size && ws_flags[in[i] + 128]) {
i++;
}

start = i;

// Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
// sequences, use the loop below to avoid having to go through the outer
// switch statement as much as possible
while (i + 1 < in_size
&& parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
i++;
}

// Trim off trailing whitespace
end = i;
while (ws_flags[in[end] + 128]) {
end--;
if (!parse_not_special(
in,
parse_flags,
ws_flags,
i,
start,
end
)) {
break;
}

// Finally append text
Expand Down Expand Up @@ -6021,4 +6053,4 @@ namespace csv {
}
}

#endif
#endif
80 changes: 56 additions & 24 deletions single_include_test/csv.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
/*
CSV for C++, version 1.3.0.1
CSV for C++, version 1.3.1
https://github.com/vincentlaucsb/csv-parser
MIT License
Expand Down Expand Up @@ -4176,6 +4176,49 @@ namespace csv {
return ret;
}

/** Parse a CSV field until a delimiter is hit
 *
 *  Skips leading whitespace, consumes the contiguous run of NOT_SPECIAL
 *  characters that follows, and then trims trailing whitespace off that run.
 *
 *  @param[in]     in           Text being parsed
 *  @param[in]     parse_flags  Lookup table indexed by `character + 128`
 *  @param[in]     ws_flags     Lookup table indexed by `character + 128`;
 *                              `true` entries are trimmed as whitespace
 *  @param[in,out] i            On entry, the index at which the field begins.
 *                              When `true` is returned, the index of the last
 *                              character of the NOT_SPECIAL run. When `false`
 *                              is returned, the index just before the position
 *                              at which scanning stopped, so that a caller
 *                              which increments `i` resumes at that position.
 *  @param[out]    start        Index of the first non-whitespace character
 *  @param[out]    end          Index of the last non-whitespace character
 *                              (only written when `true` is returned)
 *
 *  @return `true` if `[start, end]` delimits text that should be saved to the
 *          text buffer, `false` if the field consisted solely of whitespace
 */
CONSTEXPR bool parse_not_special(
    csv::string_view in,
    const csv::internals::ParseFlags* const parse_flags,
    const bool* const ws_flags,
    size_t& i,
    size_t& start,
    size_t& end) {
    // Trim off leading whitespace
    while (i < in.size() && ws_flags[in[i] + 128]) {
        i++;
    }

    start = i;

    // Case: Whitespace ran all the way to the end of the input.
    // There is no character at in[start] to classify, so bail out here
    // rather than indexing one past the end of the view.
    if (start >= in.size()) {
        if (i > 0) {
            i--;
        }
        return false;
    }

    // Case: This field is entirely whitespace
    if (parse_flags[in[start] + 128] >= ParseFlags::DELIMITER) {
        // Back the parser up one character so switch statement
        // can process the delimiter or newline
        i--;
        return false;
    }

    // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
    // sequences, use the loop below to avoid having to go through the outer
    // switch statement as much as possible
    while (i + 1 < in.size()
        && parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
        i++;
    }

    // Trim off trailing whitespace. in[start] is known not to be whitespace
    // at this point, so the scan never needs to go below start; the explicit
    // bound keeps `end` from wrapping around if that invariant is ever broken.
    end = i;
    while (end > start && ws_flags[in[end] + 128]) {
        end--;
    }

    return true;
}

struct ParseData {
csv::string_view in;
ParseFlagMap parse_flags;
Expand Down Expand Up @@ -5003,8 +5046,7 @@ namespace csv {
text_buffer.reserve(data.in.size());
split_buffer.reserve(data.in.size() / 10);

const size_t in_size = in.size();
for (size_t i = 0; i < in_size; i++) {
for (size_t i = 0; i < in.size(); i++) {
switch (parse_flags[data.in[i] + 128]) {
case ParseFlags::DELIMITER:
if (!data.quote_escape) {
Expand All @@ -5016,7 +5058,7 @@ namespace csv {
case ParseFlags::NEWLINE:
if (!data.quote_escape) {
// End of record -> Write record
if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF)
if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF)
++i;

data.records.push_back(CSVRow(data.row_buffer));
Expand All @@ -5029,25 +5071,15 @@ namespace csv {
case ParseFlags::NOT_SPECIAL: {
size_t start, end;

// Trim off leading whitespace
while (i < in_size && ws_flags[in[i] + 128]) {
i++;
}

start = i;

// Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
// sequences, use the loop below to avoid having to go through the outer
// switch statement as much as possible
while (i + 1 < in_size
&& parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
i++;
}

// Trim off trailing whitespace
end = i;
while (ws_flags[in[end] + 128]) {
end--;
if (!parse_not_special(
in,
parse_flags,
ws_flags,
i,
start,
end
)) {
break;
}

// Finally append text
Expand Down Expand Up @@ -6021,4 +6053,4 @@ namespace csv {
}
}

#endif
#endif
Loading

0 comments on commit 4ccef57

Please sign in to comment.