Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Allow multiple filenames per user bin. #237

Merged
4 commits merged on Dec 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/chopper/input_functor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ struct input_functor
seqan3::fields<seqan3::field::seq>,
seqan3::type_list<seqan3::format_fasta, seqan3::format_fastq>>;

std::vector<std::string> filenames;
std::vector<std::vector<std::string>> filenames;

bool input_are_precomputed_files{false};

Expand Down
2 changes: 1 addition & 1 deletion include/chopper/layout/execute.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
namespace chopper::layout
{

int execute(chopper::configuration & config, std::vector<std::string> const & filenames);
int execute(chopper::configuration & config, std::vector<std::vector<std::string>> const & filenames);

} // namespace chopper::layout
2 changes: 1 addition & 1 deletion include/chopper/layout/output.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@
namespace chopper::layout
{

void write_user_bins_to(std::vector<std::string> const & filenames, std::ostream & stream);
void write_user_bins_to(std::vector<std::vector<std::string>> const & filenames, std::ostream & stream);

} // namespace chopper::layout
3 changes: 3 additions & 0 deletions include/chopper/sketch/check_filenames.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,7 @@ namespace chopper::sketch
//!\brief Checks the `filenames` for consistent files, either precomputed or sequence files.
void check_filenames(std::vector<std::string> const & filenames, configuration & config);

//!\overload
void check_filenames(std::vector<std::vector<std::string>> const & filenames, configuration & config);

} // namespace chopper::sketch
2 changes: 1 addition & 1 deletion include/chopper/sketch/read_data_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
namespace chopper::sketch
{

void read_data_file(configuration const & config, std::vector<std::string> & filenames);
void read_data_file(configuration const & config, std::vector<std::vector<std::string>> & filenames);

} // namespace chopper::sketch
2 changes: 1 addition & 1 deletion src/chopper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ int main(int argc, char const * argv[])

int exit_code{};

std::vector<std::string> filenames{};
std::vector<std::vector<std::string>> filenames{};

chopper::sketch::read_data_file(config, filenames);

Expand Down
24 changes: 15 additions & 9 deletions src/input_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,30 @@ void input_functor::operator()(size_t const num, seqan::hibf::insert_iterator it
char * const hash_data{reinterpret_cast<char *>(&hash)};
std::streamsize const hash_bytes{sizeof(hash)};

std::ifstream infile{filenames[num], std::ios::binary};
for (std::string const & filename : filenames[num])
{
std::ifstream infile{filename, std::ios::binary};

while (infile.read(hash_data, hash_bytes))
it = hash;
while (infile.read(hash_data, hash_bytes))
it = hash;
}
}
else
{
sequence_file_type fin{filenames[num]};

seqan3::shape shape = seqan3::ungapped{kmer_size};
seqan3::shape const shape = seqan3::ungapped{kmer_size};
auto minimizer_view = seqan3::views::minimiser_hash(shape,
seqan3::window_size{window_size},
seqan3::seed{adjust_seed(shape.count())});

for (auto && [seq] : fin)
for (std::string const & filename : filenames[num])
{
for (auto hash_value : seq | minimizer_view)
it = hash_value;
sequence_file_type fin{filename};

for (auto && [seq] : fin)
{
for (auto hash_value : seq | minimizer_view)
it = hash_value;
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/layout/execute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
namespace chopper::layout
{

int execute(chopper::configuration & config, std::vector<std::string> const & filenames)
int execute(chopper::configuration & config, std::vector<std::vector<std::string>> const & filenames)
{
assert(config.hibf_config.number_of_user_bins > 0);

Expand Down Expand Up @@ -102,7 +102,7 @@ int execute(chopper::configuration & config, std::vector<std::string> const & fi

assert(filenames.size() == sketches.size());
for (size_t i = 0; i < filenames.size(); ++i)
sketch::write_sketch_file(filenames[i], sketches[i], config);
sketch::write_sketch_file(filenames[i][0], sketches[i], config);
}

// Write the output to the layout file.
Expand Down
14 changes: 11 additions & 3 deletions src/layout/output.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,20 @@
namespace chopper::layout
{

void write_user_bins_to(std::vector<std::string> const & filenames, std::ostream & stream)
void write_user_bins_to(std::vector<std::vector<std::string>> const & filenames, std::ostream & stream)
{
stream << chopper::prefix::meta_chopper_user_bins_start << '\n';
size_t counter{};
for (auto const & filename : filenames)
stream << seqan::hibf::prefix::meta_header << counter++ << ' ' << filename << '\n';
for (auto const & filenames_of_user_bin : filenames)
{
// the below will write lines like this:
// @0 file1.fa file2.fa
// @1 fileABC.fa
stream << seqan::hibf::prefix::meta_header << counter++;
for (std::string const & filename : filenames_of_user_bin)
stream << ' ' << filename;
stream << '\n';
}
stream << chopper::prefix::meta_chopper_user_bins_end << '\n';
}

Expand Down
6 changes: 6 additions & 0 deletions src/sketch/check_filenames.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,10 @@ void check_filenames(std::vector<std::string> const & filenames, configuration &
}
}

//!\brief Nested overload: runs the flat `check_filenames` once for the filenames of each user bin.
void check_filenames(std::vector<std::vector<std::string>> const & filenames, configuration & config)
{
    for (std::vector<std::string> const & user_bin_files : filenames)
    {
        // Delegate to the overload that takes the filenames of a single user bin.
        check_filenames(user_bin_files, config);
    }
}

} // namespace chopper::sketch
26 changes: 16 additions & 10 deletions src/sketch/read_data_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,21 @@

#include <filesystem>
#include <fstream>
#include <ranges>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

#include <seqan3/utility/range/to.hpp>

#include <chopper/configuration.hpp>
#include <chopper/sketch/read_data_file.hpp>

namespace chopper::sketch
{

void read_data_file(configuration const & config, std::vector<std::string> & filenames)
void read_data_file(configuration const & config, std::vector<std::vector<std::string>> & filenames)
{
std::ifstream fin{config.data_file.string()};

Expand All @@ -27,18 +31,20 @@ void read_data_file(configuration const & config, std::vector<std::string> & fil
std::string line;
while (std::getline(fin, line))
{
auto tab_pos = line.find('\t');
std::vector<std::string> names;

if (tab_pos == std::string::npos)
{
std::string const filename{line.begin(), line.end()};
filenames.push_back(filename);
}
else
auto const tab_pos = line.find('\t');
std::string_view const filename_sv{line.begin(),
(tab_pos != std::string::npos) ? line.begin() + tab_pos : line.end()};

// multiple filenames may be separated by ' '
for (auto && name : std::views::split(filename_sv, ' '))
{
std::string const filename{line.begin(), line.begin() + tab_pos};
filenames.push_back(filename);
auto common_view = std::views::common(name);
names.emplace_back(common_view.begin(), common_view.end());
}

filenames.push_back(std::move(names));
}
}

Expand Down
12 changes: 7 additions & 5 deletions test/api/layout/execute_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,13 @@ TEST(execute_test, few_ubs)
config.disable_sketch_output = true;
config.hibf_config.disable_estimate_union = true; // also disables rearrangement

std::vector<std::string> filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7"};
std::vector<std::vector<std::string>>
filenames{{"seq0a", "seq0b"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}};

chopper::layout::execute(config, filenames);

std::string const expected_file{"@CHOPPER_USER_BINS\n"
"@0 seq0\n"
"@0 seq0a seq0b\n"
"@1 seq1\n"
"@2 seq2\n"
"@3 seq3\n"
Expand Down Expand Up @@ -131,7 +132,8 @@ TEST(execute_test, set_default_tmax)
config.hibf_config.number_of_user_bins = 8;
config.hibf_config.disable_estimate_union = true; // also disables rearrangement

std::vector<std::string> filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7"};
std::vector<std::vector<std::string>>
filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}};

chopper::layout::execute(config, filenames);

Expand All @@ -143,10 +145,10 @@ TEST(execute_test, many_ubs)
seqan3::test::tmp_directory tmp_dir{};
std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"};

std::vector<std::string> many_filenames;
std::vector<std::vector<std::string>> many_filenames;

for (size_t i{0}; i < 96u; ++i)
many_filenames.push_back(seqan3::detail::to_string("seq", i));
many_filenames.push_back({seqan3::detail::to_string("seq", i)});

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down
34 changes: 19 additions & 15 deletions test/api/layout/execute_with_estimation_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ TEST(execute_estimation_test, few_ubs)
config.output_filename = layout_file;
config.hibf_config.disable_estimate_union = true; // also disables rearrangement

std::vector<std::string> filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7"};
std::vector<std::vector<std::string>>
filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}};

chopper::layout::execute(config, filenames);

Expand Down Expand Up @@ -85,10 +86,10 @@ TEST(execute_estimation_test, many_ubs)
std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"};
std::filesystem::path const stats_file{layout_file.string() + ".stats"};

std::vector<std::string> many_filenames;
std::vector<std::vector<std::string>> many_filenames;

for (size_t i{0}; i < 96u; ++i)
many_filenames.push_back(seqan3::detail::to_string("seq", i));
many_filenames.push_back({seqan3::detail::to_string("seq", i)});

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down Expand Up @@ -486,11 +487,11 @@ TEST(execute_estimation_test, many_ubs_force_all)
std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"};
std::filesystem::path const stats_file{layout_file.string() + ".stats"};

std::vector<std::string> many_filenames;
std::vector<std::vector<std::string>> many_filenames;
std::vector<size_t> many_kmer_counts;

for (size_t i{0}; i < 96u; ++i)
many_filenames.push_back(seqan3::detail::to_string("seq", i));
many_filenames.push_back({seqan3::detail::to_string("seq", i)});

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down Expand Up @@ -559,16 +560,16 @@ TEST(execute_estimation_test, with_rearrangement)
std::filesystem::path const stats_file{layout_file.string() + ".stats"};
size_t const kmer_size{15};

std::vector<std::string> filenames{};
std::vector<std::vector<std::string>> filenames{};
std::vector<std::string> hll_filenames;
std::vector<size_t> expected_kmer_counts;

for (size_t i{0}; i < 49u; ++i)
{
filenames.push_back(data("seq1.fa").string());
filenames.push_back(data("seq2.fa").string());
filenames.push_back(data("seq3.fa").string());
filenames.push_back(data("small.fa").string());
filenames.push_back({data("seq1.fa").string()});
filenames.push_back({data("seq2.fa").string()});
filenames.push_back({data("seq3.fa").string()});
filenames.push_back({data("small.fa").string()});

hll_filenames.push_back("seq1.hll");
hll_filenames.push_back("seq2.hll");
Expand All @@ -584,12 +585,15 @@ TEST(execute_estimation_test, with_rearrangement)
// Inserts the k-mer hashes of every sequence read from the file(s) of user bin `num`.
auto data_input = [&](size_t const num, seqan::hibf::insert_iterator it)
{
sequence_file_type3 fin{filenames[num]};

for (auto && [seq] : fin)
for (std::string const & filename : filenames[num])
{
for (auto hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size}))
it = hash_value;
sequence_file_type3 fin{filename};

for (auto && [seq] : fin)
{
for (auto hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size}))
it = hash_value;
}
}
};

Expand Down
7 changes: 4 additions & 3 deletions test/api/layout/hibf_statistics_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@ TEST(execute_test, chopper_layout_statistics)
seqan3::test::tmp_directory tmp_dir{};
std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"};

std::vector<std::string> many_filenames;
std::vector<std::vector<std::string>> many_filenames;

for (size_t i{0}; i < 96u; ++i)
many_filenames.push_back(seqan3::detail::to_string("seq", i));
many_filenames.push_back({seqan3::detail::to_string("seq", i)});

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down Expand Up @@ -160,7 +160,8 @@ TEST(execute_test, chopper_layout_statistics_determine_best_bins)
std::filesystem::path const binning_filename{tmp_dir.path() / "output.binning"};
std::filesystem::path const stats_file{binning_filename.string() + ".stats"};

std::vector<std::string> filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7", "seq8", "seq9"};
std::vector<std::vector<std::string>>
filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}, {"seq8"}, {"seq9"}};

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down
11 changes: 7 additions & 4 deletions test/api/layout/user_bin_io_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@

TEST(output, user_bins)
{
std::vector<std::string> const filenames{"file1.fa", "file2.fa", "path/to/file3.fa", "file4.fastq"};
std::vector<std::vector<std::string>> const filenames{{"file1.fa", "fileB.fa"},
{"file2.fa"},
{"path/to/file3.fa"},
{"file4.fastq"}};

std::stringstream ss{};
chopper::layout::write_user_bins_to(filenames, ss);

std::string const expected{"@CHOPPER_USER_BINS\n"
"@0 file1.fa\n"
"@0 file1.fa fileB.fa\n"
"@1 file2.fa\n"
"@2 path/to/file3.fa\n"
"@3 file4.fastq\n"
Expand All @@ -27,14 +30,14 @@ TEST(output, user_bins)
TEST(input, user_bins)
{
std::stringstream ss{"@CHOPPER_USER_BINS\n"
"@0 file1.fa\n"
"@0 file1.fa fileB.fa\n"
"@1 file2.fa\n"
"@2 path/to/file3.fa\n"
"@3 file4.fastq\n"
"@CHOPPER_USER_BINS_END\n"};

std::vector<std::vector<std::string>> filenames = chopper::layout::read_filenames_from(ss);
std::vector<std::vector<std::string>> const expected{{"file1.fa"},
std::vector<std::vector<std::string>> const expected{{"file1.fa", "fileB.fa"},
{"file2.fa"},
{"path/to/file3.fa"},
{"file4.fastq"}};
Expand Down
13 changes: 13 additions & 0 deletions test/api/sketch/check_filenames_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,19 @@ TEST(check_filenames_test, sequence_filenames)
EXPECT_FALSE(config.precomputed_files);
}

// Checks the nested overload of check_filenames (one inner vector per user bin).
// The first user bin holds two files so that a bin with multiple filenames is
// exercised, not only single-file bins.
TEST(check_filenames_test, overload)
{
    std::vector<std::vector<std::string>> const filenames{{data("seq1.fa").string(), data("seq2.fa").string()},
                                                          {data("seq3.fa").string()}};

    chopper::configuration config;

    EXPECT_NO_THROW(chopper::sketch::check_filenames(filenames, config));

    // Only FASTA files were passed, so the configuration must not be flagged as precomputed.
    EXPECT_FALSE(config.precomputed_files);
}

TEST(check_filenames_test, minimiser_filenames)
{
std::vector<std::string> filenames{data("small.minimiser").string(),
Expand Down
Loading