From 75a1b2495fa738ae0c0e0b3d0c8bb6f4a54baefb Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Tue, 26 Sep 2023 09:13:33 +0200 Subject: [PATCH 1/4] [FEATURE] Allow multiple filenames per user bin. --- include/chopper/input_functor.hpp | 2 +- include/chopper/layout/execute.hpp | 2 +- include/chopper/layout/output.hpp | 2 +- include/chopper/sketch/check_filenames.hpp | 3 ++ include/chopper/sketch/read_data_file.hpp | 2 +- src/chopper.cpp | 2 +- src/input_functor.cpp | 24 ++++++++----- src/layout/execute.cpp | 4 +-- src/layout/output.cpp | 14 ++++++-- src/sketch/check_filenames.cpp | 6 ++++ src/sketch/read_data_file.cpp | 26 ++++++++------ test/api/layout/execute_layout_test.cpp | 9 ++--- .../layout/execute_with_estimation_test.cpp | 34 +++++++++++-------- test/api/layout/hibf_statistics_test.cpp | 7 ++-- test/api/layout/user_bin_io_test.cpp | 2 +- test/api/sketch/check_filenames_test.cpp | 13 +++++++ test/api/sketch/read_data_file_test.cpp | 29 ++++++++++++++-- 17 files changed, 126 insertions(+), 55 deletions(-) diff --git a/include/chopper/input_functor.hpp b/include/chopper/input_functor.hpp index dd37c974..2bb2850d 100644 --- a/include/chopper/input_functor.hpp +++ b/include/chopper/input_functor.hpp @@ -31,7 +31,7 @@ struct input_functor seqan3::fields, seqan3::type_list>; - std::vector filenames; + std::vector> filenames; bool input_are_precomputed_files{false}; diff --git a/include/chopper/layout/execute.hpp b/include/chopper/layout/execute.hpp index 84642c8e..137c45c0 100644 --- a/include/chopper/layout/execute.hpp +++ b/include/chopper/layout/execute.hpp @@ -15,6 +15,6 @@ namespace chopper::layout { -int execute(chopper::configuration & config, std::vector const & filenames); +int execute(chopper::configuration & config, std::vector> const & filenames); } // namespace chopper::layout diff --git a/include/chopper/layout/output.hpp b/include/chopper/layout/output.hpp index a66aafd7..ac183769 100644 --- a/include/chopper/layout/output.hpp +++ b/include/chopper/layout/output.hpp @@ -14,6 +14,6 @@ namespace chopper::layout { -void write_user_bins_to(std::vector const & filenames, std::ostream & stream); +void write_user_bins_to(std::vector> const & filenames, std::ostream & stream); } // namespace chopper::layout diff --git a/include/chopper/sketch/check_filenames.hpp b/include/chopper/sketch/check_filenames.hpp index c07e57c4..bb2700fb 100644 --- a/include/chopper/sketch/check_filenames.hpp +++ b/include/chopper/sketch/check_filenames.hpp @@ -18,4 +18,7 @@ namespace chopper::sketch //!\brief Checks the `filenames` for consistent files, either precomputed or sequence files. void check_filenames(std::vector const & filenames, configuration & config); +//!\overload +void check_filenames(std::vector> const & filenames, configuration & config); + } // namespace chopper::sketch diff --git a/include/chopper/sketch/read_data_file.hpp b/include/chopper/sketch/read_data_file.hpp index a63f9f50..6e7c8388 100644 --- a/include/chopper/sketch/read_data_file.hpp +++ b/include/chopper/sketch/read_data_file.hpp @@ -15,6 +15,6 @@ namespace chopper::sketch { -void read_data_file(configuration const & config, std::vector & filenames); +void read_data_file(configuration const & config, std::vector> & filenames); } // namespace chopper::sketch diff --git a/src/chopper.cpp b/src/chopper.cpp index d8f002fd..8234242c 100644 --- a/src/chopper.cpp +++ b/src/chopper.cpp @@ -49,7 +49,7 @@ int main(int argc, char const * argv[]) int exit_code{}; - std::vector filenames{}; + std::vector> filenames{}; chopper::sketch::read_data_file(config, filenames); diff --git a/src/input_functor.cpp b/src/input_functor.cpp index f0fdf63a..dea1b35e 100644 --- a/src/input_functor.cpp +++ b/src/input_functor.cpp @@ -32,24 +32,30 @@ void input_functor::operator()(size_t const num, seqan::hibf::insert_iterator it char * const hash_data{reinterpret_cast(&hash)}; std::streamsize const hash_bytes{sizeof(hash)}; - std::ifstream infile{filenames[num], std::ios::binary}; + for (std::string const & filename : filenames[num]) + { + std::ifstream infile{filename, std::ios::binary}; - while (infile.read(hash_data, hash_bytes)) - it = hash; + while (infile.read(hash_data, hash_bytes)) + it = hash; + } } else { - sequence_file_type fin{filenames[num]}; - - seqan3::shape shape = seqan3::ungapped{kmer_size}; + seqan3::shape const shape = seqan3::ungapped{kmer_size}; auto minimizer_view = seqan3::views::minimiser_hash(shape, seqan3::window_size{window_size}, seqan3::seed{adjust_seed(shape.count())}); - for (auto && [seq] : fin) + for (std::string const & filename : filenames[num]) { - for (auto hash_value : seq | minimizer_view) - it = hash_value; + sequence_file_type fin{filename}; + + for (auto && [seq] : fin) + { + for (auto hash_value : seq | minimizer_view) + it = hash_value; + } } } } diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 7d90f796..f3e6ef67 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -33,7 +33,7 @@ namespace chopper::layout { -int execute(chopper::configuration & config, std::vector const & filenames) +int execute(chopper::configuration & config, std::vector> const & filenames) { assert(config.hibf_config.number_of_user_bins > 0); @@ -102,7 +102,7 @@ int execute(chopper::configuration & config, std::vector const & fi assert(filenames.size() == sketches.size()); for (size_t i = 0; i < filenames.size(); ++i) - sketch::write_sketch_file(filenames[i], sketches[i], config); + sketch::write_sketch_file(filenames[i][0], sketches[i], config); } // brief Write the output to the layout file. diff --git a/src/layout/output.cpp b/src/layout/output.cpp index f0436bee..71dbd8b8 100644 --- a/src/layout/output.cpp +++ b/src/layout/output.cpp @@ -19,12 +19,20 @@ namespace chopper::layout { -void write_user_bins_to(std::vector const & filenames, std::ostream & stream) +void write_user_bins_to(std::vector> const & filenames, std::ostream & stream) { stream << chopper::prefix::meta_chopper_user_bins_start << '\n'; size_t counter{}; - for (auto const & filename : filenames) - stream << seqan::hibf::prefix::meta_header << counter++ << ' ' << filename << '\n'; + for (auto const & filenames_of_user_bin : filenames) + { + // the below will write lines like this: + // @0 file1.fa file2.fa + // @1 fileABC.fa + stream << seqan::hibf::prefix::meta_header << counter++; + for (std::string const & filename : filenames_of_user_bin) + stream << ' ' << filename; + stream << '\n'; + } stream << chopper::prefix::meta_chopper_user_bins_end << '\n'; } diff --git a/src/sketch/check_filenames.cpp b/src/sketch/check_filenames.cpp index 3b89eb48..2bf48d44 100644 --- a/src/sketch/check_filenames.cpp +++ b/src/sketch/check_filenames.cpp @@ -70,4 +70,10 @@ void check_filenames(std::vector const & filenames, configuration & } } +void check_filenames(std::vector> const & filenames, configuration & config) +{ + for (auto const & filenames_per_user_bin : filenames) + check_filenames(filenames_per_user_bin, config); +} + } // namespace chopper::sketch diff --git a/src/sketch/read_data_file.cpp b/src/sketch/read_data_file.cpp index ad52e311..40bc057c 100644 --- a/src/sketch/read_data_file.cpp +++ b/src/sketch/read_data_file.cpp @@ -7,17 +7,21 @@ #include #include +#include #include #include +#include #include +#include + #include #include namespace chopper::sketch { -void read_data_file(configuration const & config, std::vector & filenames) +void read_data_file(configuration const & config, std::vector> & filenames) { std::ifstream fin{config.data_file.string()}; @@ -27,18 +31,20 @@ void read_data_file(configuration const & config, std::vector & fil std::string line; while (std::getline(fin, line)) { - auto tab_pos = line.find('\t'); + std::vector names; - if (tab_pos == std::string::npos) - { - std::string const filename{line.begin(), line.end()}; - filenames.push_back(filename); - } - else + auto const tab_pos = line.find('\t'); + std::string_view const filename_sv{line.begin(), + (tab_pos != std::string::npos) ? line.begin() + tab_pos : line.end()}; + + // multiple filenames may be separated by ' ' + for (auto && name : std::views::split(filename_sv, ' ')) { - std::string const filename{line.begin(), line.begin() + tab_pos}; - filenames.push_back(filename); + auto common_view = std::views::common(name); + names.emplace_back(common_view.begin(), common_view.end()); } + + filenames.push_back(std::move(names)); } } diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp index 4d156b4a..d9eddd1b 100644 --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -40,7 +40,7 @@ TEST(execute_test, few_ubs) config.disable_sketch_output = true; config.hibf_config.disable_estimate_union = true; // also disables rearrangement - std::vector filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7"}; + std::vector> filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; chopper::layout::execute(config, filenames); @@ -131,7 +131,8 @@ TEST(execute_test, set_default_tmax) config.hibf_config.number_of_user_bins = 8; config.hibf_config.disable_estimate_union = true; // also disables rearrangement - std::vector filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7"}; + std::vector> + filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; chopper::layout::execute(config, filenames); @@ -143,10 +144,10 @@ TEST(execute_test, many_ubs) seqan3::test::tmp_directory tmp_dir{}; std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"}; - std::vector many_filenames; + std::vector> many_filenames; for (size_t i{0}; i < 96u; ++i) - many_filenames.push_back(seqan3::detail::to_string("seq", i)); + many_filenames.push_back({seqan3::detail::to_string("seq", i)}); // There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500. auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it) diff --git a/test/api/layout/execute_with_estimation_test.cpp b/test/api/layout/execute_with_estimation_test.cpp index 8be3fd73..50658941 100644 --- a/test/api/layout/execute_with_estimation_test.cpp +++ b/test/api/layout/execute_with_estimation_test.cpp @@ -49,7 +49,8 @@ TEST(execute_estimation_test, few_ubs) config.output_filename = layout_file; config.hibf_config.disable_estimate_union = true; // also disables rearrangement - std::vector filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7"}; + std::vector> + filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; chopper::layout::execute(config, filenames); @@ -85,10 +86,10 @@ TEST(execute_estimation_test, many_ubs) std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"}; std::filesystem::path const stats_file{layout_file.string() + ".stats"}; - std::vector many_filenames; + std::vector> many_filenames; for (size_t i{0}; i < 96u; ++i) - many_filenames.push_back(seqan3::detail::to_string("seq", i)); + many_filenames.push_back({seqan3::detail::to_string("seq", i)}); // There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500. auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it) @@ -486,11 +487,11 @@ TEST(execute_estimation_test, many_ubs_force_all) std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"}; std::filesystem::path const stats_file{layout_file.string() + ".stats"}; - std::vector many_filenames; + std::vector> many_filenames; std::vector many_kmer_counts; for (size_t i{0}; i < 96u; ++i) - many_filenames.push_back(seqan3::detail::to_string("seq", i)); + many_filenames.push_back({seqan3::detail::to_string("seq", i)}); // There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500. auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it) @@ -559,16 +560,16 @@ TEST(execute_estimation_test, with_rearrangement) std::filesystem::path const stats_file{layout_file.string() + ".stats"}; size_t const kmer_size{15}; - std::vector filenames{}; + std::vector> filenames{}; std::vector hll_filenames; std::vector expected_kmer_counts; for (size_t i{0}; i < 49u; ++i) { - filenames.push_back(data("seq1.fa").string()); - filenames.push_back(data("seq2.fa").string()); - filenames.push_back(data("seq3.fa").string()); - filenames.push_back(data("small.fa").string()); + filenames.push_back({data("seq1.fa").string()}); + filenames.push_back({data("seq2.fa").string()}); + filenames.push_back({data("seq3.fa").string()}); + filenames.push_back({data("small.fa").string()}); hll_filenames.push_back("seq1.hll"); hll_filenames.push_back("seq2.hll"); @@ -584,12 +585,15 @@ TEST(execute_estimation_test, with_rearrangement) // There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500. auto data_input = [&](size_t const num, seqan::hibf::insert_iterator it) { - sequence_file_type3 fin{filenames[num]}; - - for (auto && [seq] : fin) + for (std::string const & filename : filenames[num]) { - for (auto hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size})) - it = hash_value; + sequence_file_type3 fin{filename}; + + for (auto && [seq] : fin) + { + for (auto hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size})) + it = hash_value; + } } }; diff --git a/test/api/layout/hibf_statistics_test.cpp b/test/api/layout/hibf_statistics_test.cpp index 765ad9e8..17ca276a 100644 --- a/test/api/layout/hibf_statistics_test.cpp +++ b/test/api/layout/hibf_statistics_test.cpp @@ -106,10 +106,10 @@ TEST(execute_test, chopper_layout_statistics) seqan3::test::tmp_directory tmp_dir{}; std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"}; - std::vector many_filenames; + std::vector> many_filenames; for (size_t i{0}; i < 96u; ++i) - many_filenames.push_back(seqan3::detail::to_string("seq", i)); + many_filenames.push_back({seqan3::detail::to_string("seq", i)}); // There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500. auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it) @@ -160,7 +160,8 @@ TEST(execute_test, chopper_layout_statistics_determine_best_bins) std::filesystem::path const binning_filename{tmp_dir.path() / "output.binning"}; std::filesystem::path const stats_file{binning_filename.string() + ".stats"}; - std::vector filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7", "seq8", "seq9"}; + std::vector> + filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}, {"seq8"}, {"seq9"}}; // There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500. auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it) diff --git a/test/api/layout/user_bin_io_test.cpp b/test/api/layout/user_bin_io_test.cpp index daf04978..771311d1 100644 --- a/test/api/layout/user_bin_io_test.cpp +++ b/test/api/layout/user_bin_io_test.cpp @@ -9,7 +9,7 @@ TEST(output, user_bins) { - std::vector const filenames{"file1.fa", "file2.fa", "path/to/file3.fa", "file4.fastq"}; + std::vector> const filenames{{"file1.fa"}, {"file2.fa"}, {"path/to/file3.fa"}, {"file4.fastq"}}; std::stringstream ss{}; chopper::layout::write_user_bins_to(filenames, ss); diff --git a/test/api/sketch/check_filenames_test.cpp b/test/api/sketch/check_filenames_test.cpp index af2df860..48299e7b 100644 --- a/test/api/sketch/check_filenames_test.cpp +++ b/test/api/sketch/check_filenames_test.cpp @@ -27,6 +27,19 @@ TEST(check_filenames_test, sequence_filenames) EXPECT_FALSE(config.precomputed_files); } +TEST(check_filenames_test, overload) +{ + std::vector> filenames{{data("seq1.fa").string()}, + {data("seq2.fa").string()}, + {data("seq3.fa").string()}}; + + chopper::configuration config; + + EXPECT_NO_THROW(chopper::sketch::check_filenames(filenames, config)); + + EXPECT_FALSE(config.precomputed_files); +} + TEST(check_filenames_test, minimiser_filenames) { std::vector filenames{data("small.minimiser").string(), diff --git a/test/api/sketch/read_data_file_test.cpp b/test/api/sketch/read_data_file_test.cpp index ffd92ad7..a5b05c6a 100644 --- a/test/api/sketch/read_data_file_test.cpp +++ b/test/api/sketch/read_data_file_test.cpp @@ -12,6 +12,8 @@ #include #include +#include + #include #include @@ -20,7 +22,7 @@ TEST(read_data_file_test, file_open_error) { chopper::configuration config{}; - std::vector filenames{}; + std::vector> filenames{}; config.data_file = data("non_existing.file"); EXPECT_THROW(chopper::sketch::read_data_file(config, filenames), std::runtime_error); } @@ -28,11 +30,32 @@ TEST(read_data_file_test, file_open_error) TEST(read_data_file_test, small_example) { chopper::configuration config; - std::vector filenames{}; + std::vector> filenames{}; config.data_file = data("seqinfo.tsv"); chopper::sketch::read_data_file(config, filenames); - std::vector expected_filenames{"file1", "file2", "file3", "file4", "file5"}; + std::vector> expected_filenames{{"file1"}, {"file2"}, {"file3"}, {"file4"}, {"file5"}}; + EXPECT_RANGE_EQ(filenames, expected_filenames); +} + +TEST(read_data_file_test, multi_filenames) +{ + chopper::configuration config; + std::vector> filenames{}; + + seqan3::test::tmp_directory tmp_dir{}; + config.data_file = tmp_dir.path() / "multi_files.txt"; + + { + std::ofstream of{config.data_file}; + of << "file1a file1b\nfile2\nfile3a file3b file3c\n"; + } + + chopper::sketch::read_data_file(config, filenames); + + std::vector> expected_filenames{{"file1a", "file1b"}, + {"file2"}, + {"file3a", "file3b", "file3c"}}; EXPECT_RANGE_EQ(filenames, expected_filenames); } From d4330fbf9af11369010fc09b223eb234994f08dd Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Mon, 4 Dec 2023 19:24:41 +0100 Subject: [PATCH 2/4] [TEST] Add multi filename to util_display_layout_test --- test/cli/util_display_layout_test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/cli/util_display_layout_test.cpp b/test/cli/util_display_layout_test.cpp index 3d95fc67..1ba5a25e 100644 --- a/test/cli/util_display_layout_test.cpp +++ b/test/cli/util_display_layout_test.cpp @@ -25,9 +25,9 @@ std::string get_layout_with_correct_filenames(std::string_view const seq1_filena + std::string{seq1_filename} + "\n" // "@1 " - + seq2_filename.data() - + "\n" // - "@2 " + + seq2_filename.data() + " " + seq2_filename.data() + // ensure that multi filename works + +"\n" // + "@2 " + seq3_filename.data() + "\n" // "@3 " From dba6672ff76fe70bb0628c53af516a52b6c36d34 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Mon, 4 Dec 2023 19:45:24 +0100 Subject: [PATCH 3/4] [TEST] Add multi filename example to execute_layout_test. --- test/api/layout/execute_layout_test.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp index d9eddd1b..696a976e 100644 --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -40,12 +40,13 @@ TEST(execute_test, few_ubs) config.disable_sketch_output = true; config.hibf_config.disable_estimate_union = true; // also disables rearrangement - std::vector> filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; + std::vector> + filenames{{"seq0a", "seq0b"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; chopper::layout::execute(config, filenames); std::string const expected_file{"@CHOPPER_USER_BINS\n" - "@0 seq0\n" + "@0 seq0a seq0b\n" "@1 seq1\n" "@2 seq2\n" "@3 seq3\n" From 8521f011041cb7574e606cd60dfc3a9785db5a1f Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Mon, 4 Dec 2023 19:53:16 +0100 Subject: [PATCH 4/4] [TEST] Adapt user_bin_io_test to multi filenames. --- test/api/layout/user_bin_io_test.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/api/layout/user_bin_io_test.cpp b/test/api/layout/user_bin_io_test.cpp index 771311d1..8e9fc97c 100644 --- a/test/api/layout/user_bin_io_test.cpp +++ b/test/api/layout/user_bin_io_test.cpp @@ -9,13 +9,16 @@ TEST(output, user_bins) { - std::vector> const filenames{{"file1.fa"}, {"file2.fa"}, {"path/to/file3.fa"}, {"file4.fastq"}}; + std::vector> const filenames{{"file1.fa", "fileB.fa"}, + {"file2.fa"}, + {"path/to/file3.fa"}, + {"file4.fastq"}}; std::stringstream ss{}; chopper::layout::write_user_bins_to(filenames, ss); std::string const expected{"@CHOPPER_USER_BINS\n" - "@0 file1.fa\n" + "@0 file1.fa fileB.fa\n" "@1 file2.fa\n" "@2 path/to/file3.fa\n" "@3 file4.fastq\n" @@ -27,14 +30,14 @@ TEST(output, user_bins) TEST(input, user_bins) { std::stringstream ss{"@CHOPPER_USER_BINS\n" - "@0 file1.fa\n" + "@0 file1.fa fileB.fa\n" "@1 file2.fa\n" "@2 path/to/file3.fa\n" "@3 file4.fastq\n" "@CHOPPER_USER_BINS_END\n"}; std::vector> filenames = chopper::layout::read_filenames_from(ss); - std::vector> const expected{{"file1.fa"}, + std::vector> const expected{{"file1.fa", "fileB.fa"}, {"file2.fa"}, {"path/to/file3.fa"}, {"file4.fastq"}};