Skip to content

Commit

Permalink
Performance/CMake Improvements (#27)
Browse files Browse the repository at this point in the history
Parser is now capable of just over 200 MB/sec from disk and 240 MB/sec from memory on an Intel Core i7-8550U CPU
  • Loading branch information
vincentlaucsb authored Apr 30, 2019
1 parent b5b4a72 commit d52f177
Show file tree
Hide file tree
Showing 7 changed files with 238 additions and 187 deletions.
64 changes: 7 additions & 57 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,67 +19,17 @@ else()
set(CMAKE_CXX_FLAGS_DEBUG "-Og -g -lgcov --coverage")
endif(MSVC)

message("CSV for C++ ${CMAKE_BUILD_TYPE} Build with ${CMAKE_CXX_COMPILER}")
set(CSV_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/)
set(CSV_SOURCE_DIR ${CSV_INCLUDE_DIR}/internal/)
set(CSV_TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/tests)

set(SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/internal/)
set(TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/tests)

# file(GLOB_RECURSE SOURCES include/ *.hpp *.cpp)
set(SOURCES
${SOURCE_DIR}/csv_reader.cpp
${SOURCE_DIR}/csv_reader_iterator.cpp
${SOURCE_DIR}/csv_row.cpp
${SOURCE_DIR}/csv_stat.cpp
${SOURCE_DIR}/csv_utility.cpp
${SOURCE_DIR}/data_type.cpp
${SOURCE_DIR}/giant_string_buffer.cpp
)
set(TEST_SOURCES
${TEST_DIR}/catch.hpp
${TEST_DIR}/main.cpp
${TEST_DIR}/test_csv_iterator.cpp
${TEST_DIR}/test_csv_buffer.cpp
${TEST_DIR}/test_csv_row.cpp
${TEST_DIR}/test_csv_stat.cpp
${TEST_DIR}/test_read_csv.cpp
${TEST_DIR}/test_write_csv.cpp
${TEST_DIR}/test_data_type.cpp
)

include_directories(${CMAKE_CURRENT_LIST_DIR}/include/)
include_directories(${TEST_DIR})
include_directories(${CSV_INCLUDE_DIR})

## Main Library
add_library(csv STATIC ${SOURCES})
set_target_properties(csv PROPERTIES LINKER_LANGUAGE CXX)
add_subdirectory(${CSV_SOURCE_DIR})

## Executables
add_executable(csv_info ${CMAKE_CURRENT_LIST_DIR}/programs/csv_info.cpp)
target_link_libraries(csv_info csv)

add_executable(csv_bench ${CMAKE_CURRENT_LIST_DIR}/programs/csv_bench.cpp)
target_link_libraries(csv_bench csv)

add_executable(csv_guess_bench ${CMAKE_CURRENT_LIST_DIR}/programs/csv_guess_bench.cpp)
target_link_libraries(csv_guess_bench csv)

add_executable(csv_stats ${CMAKE_CURRENT_LIST_DIR}/programs/csv_stats.cpp)
target_link_libraries(csv_stats csv)

add_executable(csv_generator ${CMAKE_CURRENT_LIST_DIR}/programs/csv_generator.cpp)
target_link_libraries(csv_generator csv)

add_executable(data_type_bench ${CMAKE_CURRENT_LIST_DIR}/programs/data_type_bench.cpp)
target_link_libraries(data_type_bench csv)
add_subdirectory("programs")

## Tests
add_executable(csv_test ${TEST_SOURCES})
target_link_libraries(csv_test csv)
add_custom_command(
TARGET csv_test POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
${TEST_DIR}/data $<TARGET_FILE_DIR:csv_test>/tests/data
)

enable_testing()
add_test(test csv_test)
add_subdirectory("tests")
14 changes: 14 additions & 0 deletions include/internal/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Static library holding the CSV parser implementation. Every source file
# listed here sits in the same directory as this CMakeLists.txt.
add_library(csv STATIC
  csv_reader.cpp
  csv_reader_iterator.cpp
  csv_row.cpp
  csv_stat.cpp
  csv_utility.cpp
  data_type.cpp
  giant_string_buffer.cpp
)

# State the linker language explicitly rather than leaving it to be inferred
# from the source list.
set_property(TARGET csv PROPERTY LINKER_LANGUAGE CXX)
109 changes: 63 additions & 46 deletions include/internal/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,8 @@ namespace csv {
return CSV_NOT_FOUND;
}

void CSVReader::feed(std::unique_ptr<char[]>&& buff) {
this->feed(csv::string_view(buff.get()));
void CSVReader::feed(WorkItem&& buff) {
this->feed( csv::string_view(buff.first.get(), buff.second) );
}

void CSVReader::feed(csv::string_view in) {
Expand Down Expand Up @@ -296,56 +296,72 @@ namespace csv {
this->record_buffer->reserve(in.size());
std::string& _record_buffer = *(this->record_buffer.get());

for (size_t i = 0; i < in.size(); i++) {
if (!quote_escape) {
switch (this->parse_flags[in[i] + 128]) {
case NOT_SPECIAL:
_record_buffer +=in[i];
break;
const size_t in_size = in.size();
for (size_t i = 0; i < in_size; i++) {
switch (this->parse_flags[in[i] + 128]) {
case DELIMITER:
this->split_buffer.push_back(this->record_buffer.size());
break;
if (!quote_escape) {
this->split_buffer.push_back(this->record_buffer.size());
break;
}
case NEWLINE:
// End of record -> Write record
if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF)
++i;
this->write_record();
break;
default: // Quote
// Case: Previous character was delimiter or newline
if (i) { // Don't deref past beginning
auto prev_ch = this->parse_flags[in[i - 1] + 128];
if (prev_ch >= DELIMITER) quote_escape = true;
if (!quote_escape) {
// End of record -> Write record
if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF)
++i;
this->write_record();
break;
}
case NOT_SPECIAL: {
// Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
// sequences, use the loop below to avoid having to go through the outer
// switch statement as much as possible
#if __cplusplus >= 201703L
size_t start = i;
while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) {
i++;
}

_record_buffer += in.substr(start, i - start + 1);
#else
_record_buffer += in[i];

while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) {
_record_buffer += in[++i];
}
#endif

break;
}
}
else {
switch (this->parse_flags[in[i] + 128]) {
case NOT_SPECIAL:
case DELIMITER:
case NEWLINE:
// Treat as a regular character
_record_buffer +=in[i];
break;
default: // Quote
if (!quote_escape) {
// Don't deref past beginning
if (i && this->parse_flags[in[i - 1] + 128] >= DELIMITER) {
// Case: Previous character was delimiter or newline
quote_escape = true;
}

break;
}

auto next_ch = this->parse_flags[in[i + 1] + 128];
if (next_ch >= DELIMITER) {
// Case: Delim or newline => end of field
quote_escape = false;
break;
}
else {
// Case: Escaped quote
_record_buffer +=in[i];

if (next_ch == QUOTE)
++i; // Case: Two consecutive quotes
else if (this->strict)
throw std::runtime_error("Unescaped single quote around line " +
std::to_string(this->correct_rows) + " near:\n" +
std::string(in.substr(i, 100)));
}
}
// Case: Escaped quote
_record_buffer += in[i];

if (next_ch == QUOTE)
++i; // Case: Two consecutive quotes
else if (this->strict)
throw std::runtime_error("Unescaped single quote around line " +
std::to_string(this->correct_rows) + " near:\n" +
std::string(in.substr(i, 100)));
break;
}
}

Expand Down Expand Up @@ -415,7 +431,7 @@ namespace csv {
this->feed_buffer.pop_front();

// Nullptr --> Die
if (!in) break;
if (!in.first) break;

lock.unlock(); // Release lock
this->feed(std::move(in));
Expand Down Expand Up @@ -455,11 +471,12 @@ namespace csv {
char * result = std::fgets(line_buffer, internals::PAGE_SIZE, this->infile);
if (result == NULL) break;
line_buffer += std::strlen(line_buffer);
size_t current_strlen = line_buffer - buffer.get();

if ((line_buffer - buffer.get()) >= 0.9 * BUFFER_UPPER_LIMIT) {
if (current_strlen >= 0.9 * BUFFER_UPPER_LIMIT) {
processed += (line_buffer - buffer.get());
std::unique_lock<std::mutex> lock{ this->feed_lock };
this->feed_buffer.push_back(std::move(buffer));
this->feed_buffer.push_back(std::make_pair<>(std::move(buffer), current_strlen));
this->feed_cond.notify_one();

buffer = std::unique_ptr<char[]>(new char[BUFFER_UPPER_LIMIT]); // New pointer
Expand All @@ -470,8 +487,8 @@ namespace csv {

// Feed remaining bits
std::unique_lock<std::mutex> lock{ this->feed_lock };
this->feed_buffer.push_back(std::move(buffer));
this->feed_buffer.push_back(nullptr); // Termination signal
this->feed_buffer.push_back(std::make_pair<>(std::move(buffer), line_buffer - buffer.get()));
this->feed_buffer.push_back(std::make_pair<>(nullptr, 0)); // Termination signal
this->feed_cond.notify_one();
lock.unlock();
worker.join();
Expand Down
8 changes: 5 additions & 3 deletions include/internal/csv_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ namespace csv {
NEWLINE
};

using WorkItem = std::pair<std::unique_ptr<char[]>, size_t>; /**<
@brief A string buffer and its size */

std::vector<CSVReader::ParseFlags> make_flags() const;

internals::GiantStringBuffer record_buffer; /**<
Expand Down Expand Up @@ -195,7 +198,7 @@ namespace csv {

/** @name Multi-Threaded File Reading Functions */
///@{
void feed(std::unique_ptr<char[]>&&); /**< @brief Helper for read_csv_worker() */
void feed(WorkItem&&); /**< @brief Helper for read_csv_worker() */
void read_csv(
const std::string& filename,
const size_t& bytes = internals::ITERATION_CHUNK_SIZE
Expand All @@ -208,8 +211,7 @@ namespace csv {
std::FILE* infile = nullptr; /**< @brief Current file handle.
Destroyed by ~CSVReader(). */

std::deque<std::unique_ptr<char[]>>
feed_buffer; /**< @brief Message queue for worker */
std::deque<WorkItem> feed_buffer; /**< @brief Message queue for worker */

std::mutex feed_lock; /**< @brief Allow only one worker to write */
std::condition_variable feed_cond; /**< @brief Wake up worker */
Expand Down
17 changes: 17 additions & 0 deletions programs/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Command-line programs built on top of the `csv` library target.
# Each name below produces an executable of the same name, compiled from
# <name>.cpp located next to this CMakeLists.txt.
foreach(csv_program_name IN ITEMS
    csv_info
    csv_bench
    csv_guess_bench
    csv_stats
    csv_generator
    data_type_bench
)
  add_executable(${csv_program_name}
    ${CMAKE_CURRENT_LIST_DIR}/${csv_program_name}.cpp
  )

  # PRIVATE: executables have no consumers, so nothing needs to propagate.
  # Using the keyword form also avoids the legacy plain signature.
  target_link_libraries(${csv_program_name} PRIVATE csv)
endforeach()
Loading

0 comments on commit d52f177

Please sign in to comment.