Skip to content

Commit

Permalink
Fix issue parsing very long rows (#95)
Browse files Browse the repository at this point in the history
* Added some scratch work

* Added tests of long CSV rows

* Update single header files
  • Loading branch information
vincentlaucsb committed May 9, 2020
1 parent 4ccef57 commit d490180
Show file tree
Hide file tree
Showing 11 changed files with 87 additions and 29 deletions.
2 changes: 1 addition & 1 deletion include/csv.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
CSV for C++, version 1.3.1
CSV for C++, version 1.3.2
https://github.com/vincentlaucsb/csv-parser
MIT License
Expand Down
2 changes: 1 addition & 1 deletion include/internal/csv_reader_internals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace csv {
switch (parse_flags[data.in[i] + 128]) {
case ParseFlags::DELIMITER:
if (!data.quote_escape) {
split_buffer.push_back((unsigned short)row_buffer.size());
split_buffer.push_back((internals::StrBufferPos)row_buffer.size());
break;
}

Expand Down
2 changes: 1 addition & 1 deletion include/internal/csv_row.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ namespace csv {
CSVRow(const internals::BufferPtr& _buffer) : buffer(_buffer), data(_buffer->get_row()) {};

/** Constructor for testing */
CSVRow(const std::string& str, const std::vector<unsigned short>& splits,
CSVRow(const std::string& str, const std::vector<internals::StrBufferPos>& splits,
const std::shared_ptr<internals::ColNames>& col_names)
: CSVRow(internals::BufferPtr(new internals::RawRowBuffer(str, splits, col_names))) {};

Expand Down
6 changes: 4 additions & 2 deletions include/internal/data_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ namespace csv {
long double exponent = 0;
auto result = data_type(exponential_part, &exponent);

if (result >= CSV_INT8 && result <= CSV_DOUBLE) {
// The exponent in scientific notation must be an integer, so a fractional (CSV_DOUBLE) exponent is rejected
if (result >= CSV_INT8 && result < CSV_DOUBLE) {
if (out) *out = coeff * pow10(exponent);
return CSV_DOUBLE;
}
Expand Down Expand Up @@ -285,8 +286,9 @@ namespace csv {
case 'e':
case 'E':
// Process scientific notation
if (prob_float) {
if (prob_float || isdigit(in[i - 1])) {
size_t exponent_start_idx = i + 1;
prob_float = true;

// Strip out plus sign
if (in[i + 1] == '+') {
Expand Down
4 changes: 2 additions & 2 deletions include/internal/row_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ namespace csv {
{
const size_t head_idx = this->current_split_idx,
new_split_idx = this->split_buffer.size();
unsigned short n_cols = (new_split_idx - head_idx > 0) ?
(unsigned short)(new_split_idx - head_idx + 1): 0;
StrBufferPos n_cols = (new_split_idx - head_idx > 0) ?
(StrBufferPos)(new_split_idx - head_idx + 1): 0;

this->current_split_idx = new_split_idx;
return ColumnPositions(head_idx, n_cols);
Expand Down
4 changes: 2 additions & 2 deletions include/internal/row_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace csv {
struct ColNames;
using BufferPtr = std::shared_ptr<RawRowBuffer>;
using ColNamesPtr = std::shared_ptr<ColNames>;
using StrBufferPos = unsigned short;
using StrBufferPos = size_t;
using SplitArray = std::vector<StrBufferPos>;

/** @struct ColNames
Expand Down Expand Up @@ -110,7 +110,7 @@ namespace csv {

struct ColumnPositions {
ColumnPositions() = default;
constexpr ColumnPositions(size_t _start, unsigned short _size) : start(_start), n_cols(_size) {};
constexpr ColumnPositions(size_t _start, StrBufferPos _size) : start(_start), n_cols(_size) {};
size_t start; /**< Where in split_buffer the array of column positions begins */
size_t n_cols; /**< Number of columns */
};
Expand Down
21 changes: 12 additions & 9 deletions single_include/csv.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
/*
CSV for C++, version 1.3.1
CSV for C++, version 1.3.2
https://github.com/vincentlaucsb/csv-parser
MIT License
Expand Down Expand Up @@ -29,6 +29,7 @@ SOFTWARE.
#ifndef CSV_HPP
#define CSV_HPP


// Copyright 2017-2019 by Martin Moene
//
// string-view lite, a C++17-like string_view for C++98 and later.
Expand Down Expand Up @@ -3427,7 +3428,8 @@ namespace csv {
long double exponent = 0;
auto result = data_type(exponential_part, &exponent);

if (result >= CSV_INT8 && result <= CSV_DOUBLE) {
// The exponent in scientific notation must be an integer, so a fractional (CSV_DOUBLE) exponent is rejected
if (result >= CSV_INT8 && result < CSV_DOUBLE) {
if (out) *out = coeff * pow10(exponent);
return CSV_DOUBLE;
}
Expand Down Expand Up @@ -3518,8 +3520,9 @@ namespace csv {
case 'e':
case 'E':
// Process scientific notation
if (prob_float) {
if (prob_float || isdigit(in[i - 1])) {
size_t exponent_start_idx = i + 1;
prob_float = true;

// Strip out plus sign
if (in[i + 1] == '+') {
Expand Down Expand Up @@ -3677,7 +3680,7 @@ namespace csv {
struct ColNames;
using BufferPtr = std::shared_ptr<RawRowBuffer>;
using ColNamesPtr = std::shared_ptr<ColNames>;
using StrBufferPos = unsigned short;
using StrBufferPos = size_t;
using SplitArray = std::vector<StrBufferPos>;

/** @struct ColNames
Expand Down Expand Up @@ -3767,7 +3770,7 @@ namespace csv {

struct ColumnPositions {
ColumnPositions() = default;
constexpr ColumnPositions(size_t _start, unsigned short _size) : start(_start), n_cols(_size) {};
constexpr ColumnPositions(size_t _start, StrBufferPos _size) : start(_start), n_cols(_size) {};
size_t start; /**< Where in split_buffer the array of column positions begins */
size_t n_cols; /**< Number of columns */
};
Expand Down Expand Up @@ -3967,7 +3970,7 @@ namespace csv {
CSVRow(const internals::BufferPtr& _buffer) : buffer(_buffer), data(_buffer->get_row()) {};

/** Constructor for testing */
CSVRow(const std::string& str, const std::vector<unsigned short>& splits,
CSVRow(const std::string& str, const std::vector<internals::StrBufferPos>& splits,
const std::shared_ptr<internals::ColNames>& col_names)
: CSVRow(internals::BufferPtr(new internals::RawRowBuffer(str, splits, col_names))) {};

Expand Down Expand Up @@ -5050,7 +5053,7 @@ namespace csv {
switch (parse_flags[data.in[i] + 128]) {
case ParseFlags::DELIMITER:
if (!data.quote_escape) {
split_buffer.push_back((unsigned short)row_buffer.size());
split_buffer.push_back((internals::StrBufferPos)row_buffer.size());
break;
}

Expand Down Expand Up @@ -6013,8 +6016,8 @@ namespace csv {
{
const size_t head_idx = this->current_split_idx,
new_split_idx = this->split_buffer.size();
unsigned short n_cols = (new_split_idx - head_idx > 0) ?
(unsigned short)(new_split_idx - head_idx + 1): 0;
StrBufferPos n_cols = (new_split_idx - head_idx > 0) ?
(StrBufferPos)(new_split_idx - head_idx + 1): 0;

this->current_split_idx = new_split_idx;
return ColumnPositions(head_idx, n_cols);
Expand Down
21 changes: 12 additions & 9 deletions single_include_test/csv.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
/*
CSV for C++, version 1.3.1
CSV for C++, version 1.3.2
https://github.com/vincentlaucsb/csv-parser
MIT License
Expand Down Expand Up @@ -29,6 +29,7 @@ SOFTWARE.
#ifndef CSV_HPP
#define CSV_HPP


// Copyright 2017-2019 by Martin Moene
//
// string-view lite, a C++17-like string_view for C++98 and later.
Expand Down Expand Up @@ -3427,7 +3428,8 @@ namespace csv {
long double exponent = 0;
auto result = data_type(exponential_part, &exponent);

if (result >= CSV_INT8 && result <= CSV_DOUBLE) {
// The exponent in scientific notation must be an integer, so a fractional (CSV_DOUBLE) exponent is rejected
if (result >= CSV_INT8 && result < CSV_DOUBLE) {
if (out) *out = coeff * pow10(exponent);
return CSV_DOUBLE;
}
Expand Down Expand Up @@ -3518,8 +3520,9 @@ namespace csv {
case 'e':
case 'E':
// Process scientific notation
if (prob_float) {
if (prob_float || isdigit(in[i - 1])) {
size_t exponent_start_idx = i + 1;
prob_float = true;

// Strip out plus sign
if (in[i + 1] == '+') {
Expand Down Expand Up @@ -3677,7 +3680,7 @@ namespace csv {
struct ColNames;
using BufferPtr = std::shared_ptr<RawRowBuffer>;
using ColNamesPtr = std::shared_ptr<ColNames>;
using StrBufferPos = unsigned short;
using StrBufferPos = size_t;
using SplitArray = std::vector<StrBufferPos>;

/** @struct ColNames
Expand Down Expand Up @@ -3767,7 +3770,7 @@ namespace csv {

struct ColumnPositions {
ColumnPositions() = default;
constexpr ColumnPositions(size_t _start, unsigned short _size) : start(_start), n_cols(_size) {};
constexpr ColumnPositions(size_t _start, StrBufferPos _size) : start(_start), n_cols(_size) {};
size_t start; /**< Where in split_buffer the array of column positions begins */
size_t n_cols; /**< Number of columns */
};
Expand Down Expand Up @@ -3967,7 +3970,7 @@ namespace csv {
CSVRow(const internals::BufferPtr& _buffer) : buffer(_buffer), data(_buffer->get_row()) {};

/** Constructor for testing */
CSVRow(const std::string& str, const std::vector<unsigned short>& splits,
CSVRow(const std::string& str, const std::vector<internals::StrBufferPos>& splits,
const std::shared_ptr<internals::ColNames>& col_names)
: CSVRow(internals::BufferPtr(new internals::RawRowBuffer(str, splits, col_names))) {};

Expand Down Expand Up @@ -5050,7 +5053,7 @@ namespace csv {
switch (parse_flags[data.in[i] + 128]) {
case ParseFlags::DELIMITER:
if (!data.quote_escape) {
split_buffer.push_back((unsigned short)row_buffer.size());
split_buffer.push_back((internals::StrBufferPos)row_buffer.size());
break;
}

Expand Down Expand Up @@ -6013,8 +6016,8 @@ namespace csv {
{
const size_t head_idx = this->current_split_idx,
new_split_idx = this->split_buffer.size();
unsigned short n_cols = (new_split_idx - head_idx > 0) ?
(unsigned short)(new_split_idx - head_idx + 1): 0;
StrBufferPos n_cols = (new_split_idx - head_idx > 0) ?
(StrBufferPos)(new_split_idx - head_idx + 1): 0;

this->current_split_idx = new_split_idx;
return ColumnPositions(head_idx, n_cols);
Expand Down
4 changes: 2 additions & 2 deletions tests/test_csv_row.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ TEST_CASE("CSVRow Test", "[test_csv_row]") {
"Col3"
"Col4";

std::vector<unsigned short> splits = { 4, 8, 12 };
std::vector<internals::StrBufferPos> splits = { 4, 8, 12 };

const CSVRow row(str, splits, col_names);

Expand Down Expand Up @@ -90,7 +90,7 @@ TEST_CASE("CSVField operator==", "[test_csv_field_equal]") {
"3"
"3.14";

std::vector<unsigned short> splits = { 1, 2, 3 };
std::vector<internals::StrBufferPos> splits = { 1, 2, 3 };
CSVRow row(str, splits, col_names);

REQUIRE(row["A"] == 1);
Expand Down
6 changes: 6 additions & 0 deletions tests/test_data_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@ TEST_CASE("Parse Scientific Notation", "[e_notation]") {
// Test parsing e notation
long double out;

REQUIRE(data_type("1E-06", &out) == CSV_DOUBLE);
REQUIRE(is_equal(out, 0.000001L));

REQUIRE(data_type("1e-06", &out) == CSV_DOUBLE);
REQUIRE(is_equal(out, 0.000001L));

REQUIRE(data_type("2.17222E+02", &out) == CSV_DOUBLE);
REQUIRE(is_equal(out, 217.222L));

Expand Down
44 changes: 44 additions & 0 deletions tests/test_read_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,4 +375,48 @@ timestamp,distance,angle,amplitude

// Original issue: Leading comments appeared in column names
REQUIRE(expected == reader.get_col_names());
}

// Reported in: https://github.com/vincentlaucsb/csv-parser/issues/92
//
// Regression test for very wide rows: builds a header row and one data row
// of n_cols fields each, so both the column count and the character offsets
// within a row exceed 65535 (the range of a 16-bit unsigned short), then
// checks that every field of the data row is read back unchanged.
TEST_CASE("Long Row Test", "[long_row_regression]") {
    // Unsigned so it compares against container sizes without a
    // signed/unsigned mismatch inside REQUIRE
    constexpr size_t n_cols = 100000;

    std::stringstream csv_string;

    // Make header row: "0,1,2,...,n_cols-1\n"
    for (size_t col = 0; col < n_cols; col++) {
        const bool last = (col + 1 == n_cols);
        // '\n' rather than std::endl: no flush is needed on a string stream
        csv_string << col << (last ? '\n' : ',');
    }

    // Make data row: column index scaled by 1e-6, default stream formatting
    for (size_t col = 0; col < n_cols; col++) {
        const bool last = (col + 1 == n_cols);
        csv_string << static_cast<double>(col) * 0.000001 << (last ? '\n' : ',');
    }

    auto rows = parse(csv_string.str());
    REQUIRE(rows.get_col_names().size() == n_cols);

    // The data row must actually be available; otherwise the loop below
    // would iterate nothing and the test would pass without checking anything
    CSVRow row;
    REQUIRE(rows.read_row(row));

    size_t n_checked = 0;

    // Make sure all CSV fields are correct
    for (auto& field : row) {
        // Format the expected value the same way the input was generated
        std::stringstream expected;
        expected << static_cast<double>(n_checked) * 0.000001;
        REQUIRE(field.get<>() == expected.str());
        n_checked++;
    }

    // Every column must have been visited, not just a truncated prefix
    REQUIRE(n_checked == n_cols);
}

0 comments on commit d490180

Please sign in to comment.