Skip to content

Commit

Permalink
Fix "Bug: Delimiter guessing overwrites column names." (#46)
Browse files Browse the repository at this point in the history
* Created CSVGuessResult data type

* Added test to verify fix works

* Update test_read_csv.cpp

* Added more robust test case
  • Loading branch information
vincentlaucsb authored Aug 18, 2019
1 parent 0c1bcee commit a06a03b
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 22 deletions.
6 changes: 6 additions & 0 deletions include/internal/csv_format.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
namespace csv {
class CSVReader;

/** Stores the inferred format of a CSV file. */
struct CSVGuessResult {
char delim;
int header_row;
};

/** Stores information about how to parse a CSV file.
* Can be used to construct a csv::CSVReader.
*/
Expand Down
28 changes: 17 additions & 11 deletions include/internal/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ namespace csv {
}
}

CSVFormat CSVGuesser::guess_delim() {
CSVGuessResult CSVGuesser::guess_delim() {
/** Guess the delimiter of a CSV by scanning the first 100 lines by
* First assuming that the header is on the first row
* If the first guess returns too few rows, then we move to the second
Expand All @@ -49,7 +49,7 @@ namespace csv {
CSVFormat format;
if (!first_guess()) second_guess();

return format.delimiter(this->delim).header_row(this->header_row);
return { delim, header_row };
}

bool CSVGuesser::first_guess() {
Expand Down Expand Up @@ -158,7 +158,7 @@ namespace csv {
}

/** Guess the delimiter used by a delimiter-separated values file */
CSVFormat guess_format(csv::string_view filename, const std::vector<char>& delims) {
CSVGuessResult guess_format(csv::string_view filename, const std::vector<char>& delims) {
internals::CSVGuesser guesser(filename, delims);
return guesser.guess_delim();
}
Expand Down Expand Up @@ -247,16 +247,18 @@ namespace csv {
*
*/
CSVReader::CSVReader(csv::string_view filename, CSVFormat format) {
if (format.guess_delim())
format = guess_format(filename, format.possible_delimiters);
/** Guess delimiter and header row */
if (format.guess_delim()) {
auto guess_result = guess_format(filename, format.possible_delimiters);
format.delimiter(guess_result.delim);
format.header = guess_result.header_row;
}

if (!format.col_names.empty()) {
this->set_col_names(format.col_names);
}
else {
header_row = format.header;
}

header_row = format.header;
delimiter = format.get_delim();
quote_char = format.quote_char;
strict = format.strict;
Expand All @@ -272,9 +274,13 @@ namespace csv {
CSVFormat CSVReader::get_format() const {
CSVFormat format;
format.delimiter(this->delimiter)
.quote(this->quote_char)
.header_row(this->header_row)
.column_names(this->col_names->col_names);
.quote(this->quote_char);

// Since users are normally not allowed to set
// column names and header row simulatenously,
// we will set the backing variables directly here
format.col_names = this->col_names->col_names;
format.header = this->header_row;

return format;
}
Expand Down
2 changes: 1 addition & 1 deletion include/internal/csv_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ namespace csv {
public:
CSVGuesser(csv::string_view _filename, const std::vector<char>& _delims) :
filename(_filename), delims(_delims) {};
CSVFormat guess_delim();
CSVGuessResult guess_delim();
bool first_guess();
void second_guess();

Expand Down
2 changes: 1 addition & 1 deletion include/internal/csv_utility.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ namespace csv {
///@{
std::unordered_map<std::string, DataType> csv_data_types(const std::string&);
CSVFileInfo get_file_info(const std::string& filename);
CSVFormat guess_format(csv::string_view filename,
CSVGuessResult guess_format(csv::string_view filename,
const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' });
std::vector<std::string> get_col_names(
const std::string& filename,
Expand Down
42 changes: 33 additions & 9 deletions tests/test_read_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,24 +22,48 @@ TEST_CASE("col_pos() Test", "[test_col_pos]") {
}

TEST_CASE("guess_delim() Test - Pipe", "[test_guess_pipe]") {
CSVFormat format = guess_format(
CSVGuessResult format = guess_format(
"./tests/data/real_data/2009PowerStatus.txt");
REQUIRE(format.get_delim() == '|');
REQUIRE(format.get_header() == 0);
REQUIRE(format.delim == '|');
REQUIRE(format.header_row == 0);
}

TEST_CASE("guess_delim() Test - Semi-Colon", "[test_guess_scolon]") {
CSVFormat format = guess_format(
CSVGuessResult format = guess_format(
"./tests/data/real_data/YEAR07_CBSA_NAC3.txt");
REQUIRE(format.get_delim() == ';');
REQUIRE(format.get_header() == 0);
REQUIRE(format.delim == ';');
REQUIRE(format.header_row == 0);
}

TEST_CASE("guess_delim() Test - CSV with Comments", "[test_guess_comment]") {
CSVFormat format = guess_format(
CSVGuessResult format = guess_format(
"./tests/data/fake_data/ints_comments.csv");
REQUIRE(format.get_delim() == ',');
REQUIRE(format.get_header() == 5);
REQUIRE(format.delim == ',');
REQUIRE(format.header_row == 5);
}

TEST_CASE("Prevent Column Names From Being Overwritten", "[csv_col_names_overwrite]") {
std::vector<std::string> column_names = { "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10" };

// Test against a variety of different CSVFormat objects
std::vector<CSVFormat> formats = {};
formats.push_back(CSVFormat::GUESS_CSV);
formats.push_back(CSVFormat());
formats.back().delimiter(std::vector<char>({ ',', '\t', '|'}));
formats.push_back(CSVFormat());
formats.back().delimiter(std::vector<char>({ ',', '~'}));

for (auto& format_in : formats) {
// Set up the CSVReader
format_in.column_names(column_names);
CSVReader reader("./tests/data/fake_data/ints_comments.csv", format_in);

// Assert that column names weren't overwritten
CSVFormat format_out = reader.get_format();
REQUIRE(reader.get_col_names() == column_names);
REQUIRE(format_out.get_delim() == ',');
REQUIRE(format_out.get_header() == 5);
}
}

// get_file_info()
Expand Down

0 comments on commit a06a03b

Please sign in to comment.