From 1ce2ac1e8f417f928f8464ec135cf7adf049cb76 Mon Sep 17 00:00:00 2001 From: Zhicheng-Liu Date: Tue, 22 Jun 2021 18:58:29 +0100 Subject: [PATCH] Reset output block after each batch when combining classic indices When combining classic indices, for each batch the combinations of rows from each constituent index are written to an output block. The output block is reused for the next batch. As we use a bitwise OR operation to combine rows from the constituent indices, the output block should be reset to all 0s before being reused. Otherwise, previously set bits will be carried over to the next batch, accumulating false positives until the end of the batch processing loop. --- cobs/construction/classic_index.cpp | 1 + tests/classic_index_construction.cpp | 69 ++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/cobs/construction/classic_index.cpp b/cobs/construction/classic_index.cpp index 8a09904..5f980f2 100644 --- a/cobs/construction/classic_index.cpp +++ b/cobs/construction/classic_index.cpp @@ -322,6 +322,7 @@ void classic_combine_streams( t.active("write"); ofs.write(out_block.data(), new_row_bytes * this_batch); + std::fill(out_block.begin(), out_block.end(), '\0'); } t.stop(); } diff --git a/tests/classic_index_construction.cpp b/tests/classic_index_construction.cpp index c092f33..d675734 100644 --- a/tests/classic_index_construction.cpp +++ b/tests/classic_index_construction.cpp @@ -6,6 +6,9 @@ * All rights reserved. Published under the MIT License in the LICENSE file. ******************************************************************************/ +#include +#include + #include "test_util.hpp" #include #include @@ -21,6 +24,26 @@ static fs::path index_dir = base_dir / "index"; static fs::path index_file = base_dir / "index.cobs_classic"; static fs::path tmp_path = base_dir / "tmp"; +// Compare two files. Return true if the contents of both files are the same. 
+bool compare_files(const std::string& filename1, const std::string& filename2) +{ + std::ifstream file1(filename1, std::ifstream::ate | std::ifstream::binary); //open file at the end + std::ifstream file2(filename2, std::ifstream::ate | std::ifstream::binary); //open file at the end + const std::ifstream::pos_type fileSize = file1.tellg(); + + if (fileSize != file2.tellg()) { + return false; //different file size + } + + file1.seekg(0); //rewind + file2.seekg(0); //rewind + + std::istreambuf_iterator begin1(file1); + std::istreambuf_iterator begin2(file2); + + return std::equal(begin1,std::istreambuf_iterator(),begin2); //Second argument is end-of-range iterator +} + class classic_index_construction : public ::testing::Test { protected: @@ -151,4 +174,50 @@ TEST_F(classic_index_construction, combine) { } } +TEST_F(classic_index_construction, combined_index_same_as_classic_constructed) { + // This test starts with 2 copies of the same randomly generated document. + // We build a classic index for each of these two documents. + // We then combine these two classic indices into one combined index. + // The combined index should be the same as the classic index generated + // through `cobs::classic_construct` on these two documents. 
+ using cobs::pad_index; + fs::create_directories(index_dir); + fs::create_directories(index_dir/pad_index(0)); + fs::create_directories(index_dir/pad_index(1)); + fs::create_directories(index_dir/pad_index(2)); + + // prepare 2 copies of a randomly generated document + std::string random_doc_src_string = cobs::random_sequence(1000, 1); + auto random_docs = generate_documents_one(random_doc_src_string, 1); + generate_test_case(random_docs, "random_", input_dir/pad_index(0)); + generate_test_case(random_docs, "random_", input_dir/pad_index(1)); + + cobs::ClassicIndexParameters index_params; + index_params.false_positive_rate = 0.001; // in order to use a large signature size + index_params.mem_bytes = 80; + index_params.num_threads = 1; + index_params.continue_ = true; + + // generate a classic index for each document + cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(0)), + index_dir/pad_index(0)/(pad_index(0) + ".cobs_classic"), + tmp_path, index_params); + cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)), + index_dir/pad_index(0)/(pad_index(1) + ".cobs_classic"), + tmp_path, index_params); + + // generate a combined index from both classic constructed indices + fs::path combined_index; + cobs::classic_combine(index_dir/pad_index(0), index_dir/pad_index(1), combined_index, + 80, 1, false); + + // generate a classic index for both docs through classic_construct + std::string classic_constructed_index = index_dir/pad_index(2)/(pad_index(0) + + ".cobs_classic"); + cobs::classic_construct(cobs::DocumentList(input_dir), classic_constructed_index, + tmp_path, index_params); + + ASSERT_TRUE(compare_files(combined_index.string(), classic_constructed_index)); +} + /******************************************************************************/