Fix issue parsing very long rows (#95)

* Added some scratch work
* Added tests of long CSV rows
* Update single header files
vincentlaucsb · May 9, 2020 · d490180 · d490180
1 parent 4ccef57
commit d490180
Show file tree

Hide file tree

Showing 11 changed files with 87 additions and 29 deletions.
diff --git a/include/csv.hpp b/include/csv.hpp
@@ -1,5 +1,5 @@
 /*
-CSV for C++, version 1.3.1
+CSV for C++, version 1.3.2
 https://github.com/vincentlaucsb/csv-parser
 
 MIT License

diff --git a/include/internal/csv_reader_internals.cpp b/include/internal/csv_reader_internals.cpp
@@ -19,7 +19,7 @@ namespace csv {
                 switch (parse_flags[data.in[i] + 128]) {
                 case ParseFlags::DELIMITER:
                     if (!data.quote_escape) {
-                        split_buffer.push_back((unsigned short)row_buffer.size());
+                        split_buffer.push_back((internals::StrBufferPos)row_buffer.size());
                         break;
                     }
 

diff --git a/include/internal/csv_row.hpp b/include/internal/csv_row.hpp
@@ -190,7 +190,7 @@ namespace csv {
         CSVRow(const internals::BufferPtr& _buffer) : buffer(_buffer), data(_buffer->get_row()) {};
 
         /** Constructor for testing */
-        CSVRow(const std::string& str, const std::vector<unsigned short>& splits,
+        CSVRow(const std::string& str, const std::vector<internals::StrBufferPos>& splits,
             const std::shared_ptr<internals::ColNames>& col_names)
             : CSVRow(internals::BufferPtr(new internals::RawRowBuffer(str, splits, col_names))) {};
 

diff --git a/include/internal/data_type.h b/include/internal/data_type.h
@@ -194,7 +194,8 @@ namespace csv {
             long double exponent = 0;
             auto result = data_type(exponential_part, &exponent);
 
-            if (result >= CSV_INT8 && result <= CSV_DOUBLE) {
+            // Exponents in scientific notation should not be decimal numbers
+            if (result >= CSV_INT8 && result < CSV_DOUBLE) {
                 if (out) *out = coeff * pow10(exponent);
                 return CSV_DOUBLE;
             }
@@ -285,8 +286,9 @@ namespace csv {
                 case 'e':
                 case 'E':
                     // Process scientific notation
-                    if (prob_float) {
+                    if (prob_float || isdigit(in[i - 1])) {
                         size_t exponent_start_idx = i + 1;
+                        prob_float = true;
 
                         // Strip out plus sign
                         if (in[i + 1] == '+') {

diff --git a/include/internal/row_buffer.cpp b/include/internal/row_buffer.cpp
@@ -58,8 +58,8 @@ namespace csv {
         {
             const size_t head_idx = this->current_split_idx,
                 new_split_idx = this->split_buffer.size();
-            unsigned short n_cols = (new_split_idx - head_idx > 0) ?
-                (unsigned short)(new_split_idx - head_idx + 1): 0;
+            StrBufferPos n_cols = (new_split_idx - head_idx > 0) ?
+                (StrBufferPos)(new_split_idx - head_idx + 1): 0;
 
             this->current_split_idx = new_split_idx;
             return ColumnPositions(head_idx, n_cols);

diff --git a/include/internal/row_buffer.hpp b/include/internal/row_buffer.hpp
@@ -20,7 +20,7 @@ namespace csv {
         struct ColNames;
         using BufferPtr = std::shared_ptr<RawRowBuffer>;
         using ColNamesPtr = std::shared_ptr<ColNames>;
-        using StrBufferPos = unsigned short;
+        using StrBufferPos = size_t;
         using SplitArray = std::vector<StrBufferPos>;
 
         /** @struct ColNames
@@ -110,7 +110,7 @@ namespace csv {
 
         struct ColumnPositions {
             ColumnPositions() = default;
-            constexpr ColumnPositions(size_t _start, unsigned short _size) : start(_start), n_cols(_size) {};
+            constexpr ColumnPositions(size_t _start, StrBufferPos _size) : start(_start), n_cols(_size) {};
             size_t start;                /**< Where in split_buffer the array of column positions begins */
             size_t n_cols;               /**< Number of columns */
         };

diff --git a/single_include/csv.hpp b/single_include/csv.hpp
@@ -1,6 +1,6 @@
 #pragma once
 /*
-CSV for C++, version 1.3.1
+CSV for C++, version 1.3.2
 https://github.com/vincentlaucsb/csv-parser
 
 MIT License
@@ -29,6 +29,7 @@ SOFTWARE.
 #ifndef CSV_HPP
 #define CSV_HPP
 
+
 // Copyright 2017-2019 by Martin Moene
 //
 // string-view lite, a C++17-like string_view for C++98 and later.
@@ -3427,7 +3428,8 @@ namespace csv {
             long double exponent = 0;
             auto result = data_type(exponential_part, &exponent);
 
-            if (result >= CSV_INT8 && result <= CSV_DOUBLE) {
+            // Exponents in scientific notation should not be decimal numbers
+            if (result >= CSV_INT8 && result < CSV_DOUBLE) {
                 if (out) *out = coeff * pow10(exponent);
                 return CSV_DOUBLE;
             }
@@ -3518,8 +3520,9 @@ namespace csv {
                 case 'e':
                 case 'E':
                     // Process scientific notation
-                    if (prob_float) {
+                    if (prob_float || isdigit(in[i - 1])) {
                         size_t exponent_start_idx = i + 1;
+                        prob_float = true;
 
                         // Strip out plus sign
                         if (in[i + 1] == '+') {
@@ -3677,7 +3680,7 @@ namespace csv {
         struct ColNames;
         using BufferPtr = std::shared_ptr<RawRowBuffer>;
         using ColNamesPtr = std::shared_ptr<ColNames>;
-        using StrBufferPos = unsigned short;
+        using StrBufferPos = size_t;
         using SplitArray = std::vector<StrBufferPos>;
 
         /** @struct ColNames
@@ -3767,7 +3770,7 @@ namespace csv {
 
         struct ColumnPositions {
             ColumnPositions() = default;
-            constexpr ColumnPositions(size_t _start, unsigned short _size) : start(_start), n_cols(_size) {};
+            constexpr ColumnPositions(size_t _start, StrBufferPos _size) : start(_start), n_cols(_size) {};
             size_t start;                /**< Where in split_buffer the array of column positions begins */
             size_t n_cols;               /**< Number of columns */
         };
@@ -3967,7 +3970,7 @@ namespace csv {
         CSVRow(const internals::BufferPtr& _buffer) : buffer(_buffer), data(_buffer->get_row()) {};
 
         /** Constructor for testing */
-        CSVRow(const std::string& str, const std::vector<unsigned short>& splits,
+        CSVRow(const std::string& str, const std::vector<internals::StrBufferPos>& splits,
             const std::shared_ptr<internals::ColNames>& col_names)
             : CSVRow(internals::BufferPtr(new internals::RawRowBuffer(str, splits, col_names))) {};
 
@@ -5050,7 +5053,7 @@ namespace csv {
                 switch (parse_flags[data.in[i] + 128]) {
                 case ParseFlags::DELIMITER:
                     if (!data.quote_escape) {
-                        split_buffer.push_back((unsigned short)row_buffer.size());
+                        split_buffer.push_back((internals::StrBufferPos)row_buffer.size());
                         break;
                     }
 
@@ -6013,8 +6016,8 @@ namespace csv {
         {
             const size_t head_idx = this->current_split_idx,
                 new_split_idx = this->split_buffer.size();
-            unsigned short n_cols = (new_split_idx - head_idx > 0) ?
-                (unsigned short)(new_split_idx - head_idx + 1): 0;
+            StrBufferPos n_cols = (new_split_idx - head_idx > 0) ?
+                (StrBufferPos)(new_split_idx - head_idx + 1): 0;
 
             this->current_split_idx = new_split_idx;
             return ColumnPositions(head_idx, n_cols);

diff --git a/single_include_test/csv.hpp b/single_include_test/csv.hpp
@@ -1,6 +1,6 @@
 #pragma once
 /*
-CSV for C++, version 1.3.1
+CSV for C++, version 1.3.2
 https://github.com/vincentlaucsb/csv-parser
 
 MIT License
@@ -29,6 +29,7 @@ SOFTWARE.
 #ifndef CSV_HPP
 #define CSV_HPP
 
+
 // Copyright 2017-2019 by Martin Moene
 //
 // string-view lite, a C++17-like string_view for C++98 and later.
@@ -3427,7 +3428,8 @@ namespace csv {
             long double exponent = 0;
             auto result = data_type(exponential_part, &exponent);
 
-            if (result >= CSV_INT8 && result <= CSV_DOUBLE) {
+            // Exponents in scientific notation should not be decimal numbers
+            if (result >= CSV_INT8 && result < CSV_DOUBLE) {
                 if (out) *out = coeff * pow10(exponent);
                 return CSV_DOUBLE;
             }
@@ -3518,8 +3520,9 @@ namespace csv {
                 case 'e':
                 case 'E':
                     // Process scientific notation
-                    if (prob_float) {
+                    if (prob_float || isdigit(in[i - 1])) {
                         size_t exponent_start_idx = i + 1;
+                        prob_float = true;
 
                         // Strip out plus sign
                         if (in[i + 1] == '+') {
@@ -3677,7 +3680,7 @@ namespace csv {
         struct ColNames;
         using BufferPtr = std::shared_ptr<RawRowBuffer>;
         using ColNamesPtr = std::shared_ptr<ColNames>;
-        using StrBufferPos = unsigned short;
+        using StrBufferPos = size_t;
         using SplitArray = std::vector<StrBufferPos>;
 
         /** @struct ColNames
@@ -3767,7 +3770,7 @@ namespace csv {
 
         struct ColumnPositions {
             ColumnPositions() = default;
-            constexpr ColumnPositions(size_t _start, unsigned short _size) : start(_start), n_cols(_size) {};
+            constexpr ColumnPositions(size_t _start, StrBufferPos _size) : start(_start), n_cols(_size) {};
             size_t start;                /**< Where in split_buffer the array of column positions begins */
             size_t n_cols;               /**< Number of columns */
         };
@@ -3967,7 +3970,7 @@ namespace csv {
         CSVRow(const internals::BufferPtr& _buffer) : buffer(_buffer), data(_buffer->get_row()) {};
 
         /** Constructor for testing */
-        CSVRow(const std::string& str, const std::vector<unsigned short>& splits,
+        CSVRow(const std::string& str, const std::vector<internals::StrBufferPos>& splits,
             const std::shared_ptr<internals::ColNames>& col_names)
             : CSVRow(internals::BufferPtr(new internals::RawRowBuffer(str, splits, col_names))) {};
 
@@ -5050,7 +5053,7 @@ namespace csv {
                 switch (parse_flags[data.in[i] + 128]) {
                 case ParseFlags::DELIMITER:
                     if (!data.quote_escape) {
-                        split_buffer.push_back((unsigned short)row_buffer.size());
+                        split_buffer.push_back((internals::StrBufferPos)row_buffer.size());
                         break;
                     }
 
@@ -6013,8 +6016,8 @@ namespace csv {
         {
             const size_t head_idx = this->current_split_idx,
                 new_split_idx = this->split_buffer.size();
-            unsigned short n_cols = (new_split_idx - head_idx > 0) ?
-                (unsigned short)(new_split_idx - head_idx + 1): 0;
+            StrBufferPos n_cols = (new_split_idx - head_idx > 0) ?
+                (StrBufferPos)(new_split_idx - head_idx + 1): 0;
 
             this->current_split_idx = new_split_idx;
             return ColumnPositions(head_idx, n_cols);

diff --git a/tests/test_csv_row.cpp b/tests/test_csv_row.cpp
@@ -16,7 +16,7 @@ TEST_CASE("CSVRow Test", "[test_csv_row]") {
         "Col3"
         "Col4";
 
-    std::vector<unsigned short> splits = { 4, 8, 12 };
+    std::vector<internals::StrBufferPos> splits = { 4, 8, 12 };
 
     const CSVRow row(str, splits, col_names);
 
@@ -90,7 +90,7 @@ TEST_CASE("CSVField operator==", "[test_csv_field_equal]") {
         "3"
         "3.14";
 
-    std::vector<unsigned short> splits = { 1, 2, 3 };
+    std::vector<internals::StrBufferPos> splits = { 1, 2, 3 };
     CSVRow row(str, splits, col_names);
 
     REQUIRE(row["A"] == 1);

diff --git a/tests/test_data_type.cpp b/tests/test_data_type.cpp
@@ -109,6 +109,12 @@ TEST_CASE("Parse Scientific Notation", "[e_notation]") {
     // Test parsing e notation
     long double out;
 
+    REQUIRE(data_type("1E-06", &out) == CSV_DOUBLE);
+    REQUIRE(is_equal(out, 0.000001L));
+
+    REQUIRE(data_type("1e-06", &out) == CSV_DOUBLE);
+    REQUIRE(is_equal(out, 0.000001L));
+
     REQUIRE(data_type("2.17222E+02", &out) == CSV_DOUBLE);
     REQUIRE(is_equal(out, 217.222L));
 

diff --git a/tests/test_read_csv.cpp b/tests/test_read_csv.cpp
@@ -375,4 +375,48 @@ timestamp,distance,angle,amplitude
 
     // Original issue: Leading comments appeared in column names
     REQUIRE(expected == reader.get_col_names());
+}   
+
+// Reported in: https://github.com/vincentlaucsb/csv-parser/issues/92
+TEST_CASE("Long Row Test", "[long_row_regression]") {
+    std::stringstream csv_string;
+    constexpr int n_cols = 100000;
+
+    // Make header row
+    for (int i = 0; i < n_cols; i++) {
+        csv_string << i;
+        if (i + 1 == n_cols) {
+            csv_string << std::endl;
+        }
+        else {
+            csv_string << ',';
+        }
+    }
+
+    // Make data row
+    for (int i = 0; i < n_cols; i++) {
+        csv_string << (double)i * 0.000001;
+        if (i + 1 == n_cols) {
+            csv_string << std::endl;
+        }
+        else {
+            csv_string << ',';
+        }
+    }
+
+    auto rows = parse(csv_string.str());
+    REQUIRE(rows.get_col_names().size() == n_cols);
+
+    CSVRow row;
+    rows.read_row(row);
+
+    int i = 0;
+
+    // Make sure all CSV fields are correct
+    for (auto& field : row) {
+        std::stringstream temp;
+        temp << (double)i * 0.000001;
+        REQUIRE(field.get<>() == temp.str());
+        i++;
+    }
 }