Added patch for whitespace trimming (#94)

* Fix issue with whitespace parsing * Update Catch + added more test cases * Update test_read_csv.cpp * Update CMake settings * Update version + single header files * Update README.md
vincentlaucsb · May 4, 2020 · 4ccef57 · 4ccef57
1 parent 351b646
commit 4ccef57
Show file tree

Hide file tree

Showing 9 changed files with 4,111 additions and 1,240 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -51,6 +51,11 @@ add_subdirectory("programs")
 
 ## Developer settings
 if (CSV_DEVELOPER)
+    # Allow for performance profiling
+    if (MSVC)
+	    target_link_options(csv PUBLIC /PROFILE)
+    endif()
+
     # Generate a single header library
     find_package(Python3 QUIET)
     if(Python3_Interpreter_FOUND)

diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ do not hesitate to report it.
 
 ## Documentation
 
-In addition to the [Features & Examples](#features--examples) below, a [fully-fledged online documentation](http://vincela.com/csv) contains more examples, details, interesting features, and instructions for less common use cases.
+In addition to the [Features & Examples](#features--examples) below, a [fully-fledged online documentation](https://vincentlaucsb.github.io/csv-parser/html/) contains more examples, details, interesting features, and instructions for less common use cases.
 
 ## Integration
 

diff --git a/include/csv.hpp b/include/csv.hpp
@@ -1,5 +1,5 @@
 /*
-CSV for C++, version 1.3.0.1
+CSV for C++, version 1.3.1
 https://github.com/vincentlaucsb/csv-parser
 
 MIT License

diff --git a/include/internal/csv_reader_internals.cpp b/include/internal/csv_reader_internals.cpp
@@ -15,8 +15,7 @@ namespace csv {
             text_buffer.reserve(data.in.size());
             split_buffer.reserve(data.in.size() / 10);
 
-            const size_t in_size = in.size();
-            for (size_t i = 0; i < in_size; i++) {
+            for (size_t i = 0; i < in.size(); i++) {
                 switch (parse_flags[data.in[i] + 128]) {
                 case ParseFlags::DELIMITER:
                     if (!data.quote_escape) {
@@ -28,7 +27,7 @@ namespace csv {
                 case ParseFlags::NEWLINE:
                     if (!data.quote_escape) {
                         // End of record -> Write record
-                        if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF)
+                        if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF)
                             ++i;
 
                         data.records.push_back(CSVRow(data.row_buffer));
@@ -41,25 +40,15 @@ namespace csv {
                 case ParseFlags::NOT_SPECIAL: {
                     size_t start, end;
 
-                    // Trim off leading whitespace
-                    while (i < in_size && ws_flags[in[i] + 128]) {
-                        i++;
-                    }
-
-                    start = i;
-
-                    // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
-                    // sequences, use the loop below to avoid having to go through the outer
-                    // switch statement as much as possible
-                    while (i + 1 < in_size
-                        && parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
-                        i++;
-                    }
-
-                    // Trim off trailing whitespace
-                    end = i;
-                    while (ws_flags[in[end] + 128]) {
-                        end--;
+                    if (!parse_not_special(
+                        in,
+                        parse_flags,
+                        ws_flags,
+                        i,
+                        start,
+                        end
+                    )) {
+                        break;
                     }
 
                     // Finally append text

diff --git a/include/internal/csv_reader_internals.hpp b/include/internal/csv_reader_internals.hpp
@@ -70,6 +70,49 @@ namespace csv {
             return ret;
         }
 
+        /** Scan one CSV field starting at in[i], trimming whitespace from both ends
+         *  (flag tables are indexed by character value + 128). On success, start and
+         *  end are the inclusive bounds of the trimmed text and i is the last index scanned.
+         *  @return true if there is text to save to the text buffer, false otherwise */
+        CONSTEXPR bool parse_not_special(
+            csv::string_view in,
+            const csv::internals::ParseFlags* const parse_flags,
+            const bool* const ws_flags,
+            size_t& i,
+            size_t& start,
+            size_t& end) {
+            // Trim off leading whitespace
+            while (i < in.size() && ws_flags[in[i] + 128]) {
+                i++;
+            }
+
+            start = i;
+
+            // Case: nothing but whitespace before a delimiter/newline or the end of input
+            if (start >= in.size() || parse_flags[in[start] + 128] >= ParseFlags::DELIMITER) {
+                // Checking the bound first avoids reading in[in.size()]. Back up one character
+                // so the caller's switch sees the special character next (or its loop ends)
+                i--;
+                return false;
+            }
+
+            // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
+            // sequences, use the loop below to avoid having to go through the outer
+            // switch statement as much as possible
+            while (i + 1 < in.size()
+                && parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
+                i++;
+            }
+
+            // Trim off trailing whitespace (in[start] is not whitespace, so end >= start)
+            end = i;
+            while (ws_flags[in[end] + 128]) {
+                end--;
+            }
+
+            return true;
+        }
+
         struct ParseData {
             csv::string_view in;
             ParseFlagMap parse_flags;

diff --git a/single_include/csv.hpp b/single_include/csv.hpp
@@ -1,6 +1,6 @@
 #pragma once
 /*
-CSV for C++, version 1.3.0.1
+CSV for C++, version 1.3.1
 https://github.com/vincentlaucsb/csv-parser
 
 MIT License
@@ -4176,6 +4176,49 @@ namespace csv {
             return ret;
         }
 
+        /** Scan one CSV field starting at in[i], trimming whitespace from both ends
+         *  (flag tables are indexed by character value + 128). On success, start and
+         *  end are the inclusive bounds of the trimmed text and i is the last index scanned.
+         *  @return true if there is text to save to the text buffer, false otherwise */
+        CONSTEXPR bool parse_not_special(
+            csv::string_view in,
+            const csv::internals::ParseFlags* const parse_flags,
+            const bool* const ws_flags,
+            size_t& i,
+            size_t& start,
+            size_t& end) {
+            // Trim off leading whitespace
+            while (i < in.size() && ws_flags[in[i] + 128]) {
+                i++;
+            }
+
+            start = i;
+
+            // Case: nothing but whitespace before a delimiter/newline or the end of input
+            if (start >= in.size() || parse_flags[in[start] + 128] >= ParseFlags::DELIMITER) {
+                // Checking the bound first avoids reading in[in.size()]. Back up one character
+                // so the caller's switch sees the special character next (or its loop ends)
+                i--;
+                return false;
+            }
+
+            // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
+            // sequences, use the loop below to avoid having to go through the outer
+            // switch statement as much as possible
+            while (i + 1 < in.size()
+                && parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
+                i++;
+            }
+
+            // Trim off trailing whitespace (in[start] is not whitespace, so end >= start)
+            end = i;
+            while (ws_flags[in[end] + 128]) {
+                end--;
+            }
+
+            return true;
+        }
+
         struct ParseData {
             csv::string_view in;
             ParseFlagMap parse_flags;
@@ -5003,8 +5046,7 @@ namespace csv {
             text_buffer.reserve(data.in.size());
             split_buffer.reserve(data.in.size() / 10);
 
-            const size_t in_size = in.size();
-            for (size_t i = 0; i < in_size; i++) {
+            for (size_t i = 0; i < in.size(); i++) {
                 switch (parse_flags[data.in[i] + 128]) {
                 case ParseFlags::DELIMITER:
                     if (!data.quote_escape) {
@@ -5016,7 +5058,7 @@ namespace csv {
                 case ParseFlags::NEWLINE:
                     if (!data.quote_escape) {
                         // End of record -> Write record
-                        if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF)
+                        if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF)
                             ++i;
 
                         data.records.push_back(CSVRow(data.row_buffer));
@@ -5029,25 +5071,15 @@ namespace csv {
                 case ParseFlags::NOT_SPECIAL: {
                     size_t start, end;
 
-                    // Trim off leading whitespace
-                    while (i < in_size && ws_flags[in[i] + 128]) {
-                        i++;
-                    }
-
-                    start = i;
-
-                    // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
-                    // sequences, use the loop below to avoid having to go through the outer
-                    // switch statement as much as possible
-                    while (i + 1 < in_size
-                        && parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
-                        i++;
-                    }
-
-                    // Trim off trailing whitespace
-                    end = i;
-                    while (ws_flags[in[end] + 128]) {
-                        end--;
+                    if (!parse_not_special(
+                        in,
+                        parse_flags,
+                        ws_flags,
+                        i,
+                        start,
+                        end
+                    )) {
+                        break;
                     }
 
                     // Finally append text
@@ -6021,4 +6053,4 @@ namespace csv {
     }
 }
 
-#endif
+#endif
diff --git a/single_include_test/csv.hpp b/single_include_test/csv.hpp
@@ -1,6 +1,6 @@
 #pragma once
 /*
-CSV for C++, version 1.3.0.1
+CSV for C++, version 1.3.1
 https://github.com/vincentlaucsb/csv-parser
 
 MIT License
@@ -4176,6 +4176,49 @@ namespace csv {
             return ret;
         }
 
+        /** Scan one CSV field starting at in[i], trimming whitespace from both ends
+         *  (flag tables are indexed by character value + 128). On success, start and
+         *  end are the inclusive bounds of the trimmed text and i is the last index scanned.
+         *  @return true if there is text to save to the text buffer, false otherwise */
+        CONSTEXPR bool parse_not_special(
+            csv::string_view in,
+            const csv::internals::ParseFlags* const parse_flags,
+            const bool* const ws_flags,
+            size_t& i,
+            size_t& start,
+            size_t& end) {
+            // Trim off leading whitespace
+            while (i < in.size() && ws_flags[in[i] + 128]) {
+                i++;
+            }
+
+            start = i;
+
+            // Case: nothing but whitespace before a delimiter/newline or the end of input
+            if (start >= in.size() || parse_flags[in[start] + 128] >= ParseFlags::DELIMITER) {
+                // Checking the bound first avoids reading in[in.size()]. Back up one character
+                // so the caller's switch sees the special character next (or its loop ends)
+                i--;
+                return false;
+            }
+
+            // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
+            // sequences, use the loop below to avoid having to go through the outer
+            // switch statement as much as possible
+            while (i + 1 < in.size()
+                && parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
+                i++;
+            }
+
+            // Trim off trailing whitespace (in[start] is not whitespace, so end >= start)
+            end = i;
+            while (ws_flags[in[end] + 128]) {
+                end--;
+            }
+
+            return true;
+        }
+
         struct ParseData {
             csv::string_view in;
             ParseFlagMap parse_flags;
@@ -5003,8 +5046,7 @@ namespace csv {
             text_buffer.reserve(data.in.size());
             split_buffer.reserve(data.in.size() / 10);
 
-            const size_t in_size = in.size();
-            for (size_t i = 0; i < in_size; i++) {
+            for (size_t i = 0; i < in.size(); i++) {
                 switch (parse_flags[data.in[i] + 128]) {
                 case ParseFlags::DELIMITER:
                     if (!data.quote_escape) {
@@ -5016,7 +5058,7 @@ namespace csv {
                 case ParseFlags::NEWLINE:
                     if (!data.quote_escape) {
                         // End of record -> Write record
-                        if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF)
+                        if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF)
                             ++i;
 
                         data.records.push_back(CSVRow(data.row_buffer));
@@ -5029,25 +5071,15 @@ namespace csv {
                 case ParseFlags::NOT_SPECIAL: {
                     size_t start, end;
 
-                    // Trim off leading whitespace
-                    while (i < in_size && ws_flags[in[i] + 128]) {
-                        i++;
-                    }
-
-                    start = i;
-
-                    // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
-                    // sequences, use the loop below to avoid having to go through the outer
-                    // switch statement as much as possible
-                    while (i + 1 < in_size
-                        && parse_flags[in[i + 1] + 128] == ParseFlags::NOT_SPECIAL) {
-                        i++;
-                    }
-
-                    // Trim off trailing whitespace
-                    end = i;
-                    while (ws_flags[in[end] + 128]) {
-                        end--;
+                    if (!parse_not_special(
+                        in,
+                        parse_flags,
+                        ws_flags,
+                        i,
+                        start,
+                        end
+                    )) {
+                        break;
                     }
 
                     // Finally append text
@@ -6021,4 +6053,4 @@ namespace csv {
     }
 }
 
-#endif
+#endif