Skip to content

Commit 5207546

Browse files
committed
Update KFF API, optimize kmer reading
1 parent aeb9ced commit 5207546

File tree

2 files changed

+25
-18
lines changed

2 files changed

+25
-18
lines changed

src/recombinator.cpp

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,12 @@ std::string to_string(handle_t handle) {
3131

3232
//------------------------------------------------------------------------------
3333

34-
// Returns the big-endian interger representation of the kmer in the canonical orientation.
35-
std::uint64_t kff_to_key(const std::uint8_t* kmer, size_t k, size_t bytes, const std::uint8_t* encoding) {
36-
std::vector<std::uint8_t> rc = kff_reverse_complement(kmer, k, encoding);
37-
return std::min(kff_parse(kmer, bytes), kff_parse(rc.data(), bytes));
34+
hash_map<Haplotypes::Subchain::kmer_type, size_t>::iterator
35+
find_kmer(hash_map<Haplotypes::Subchain::kmer_type, size_t>& counts, Haplotypes::Subchain::kmer_type kmer, size_t k) {
36+
Haplotypes::Subchain::kmer_type rc = minimizer_reverse_complement(kmer, k);
37+
auto forward = counts.find(kmer);
38+
auto reverse = counts.find(rc);
39+
return (forward != counts.end() ? forward : reverse);
3840
}
3941

4042
hash_map<Haplotypes::Subchain::kmer_type, size_t> Haplotypes::kmer_counts(const std::string& kff_file, bool verbose) const {
@@ -51,6 +53,7 @@ hash_map<Haplotypes::Subchain::kmer_type, size_t> Haplotypes::kmer_counts(const
5153
size_t data_bytes = reader.get_var("data_size");
5254

5355
// Populate the map with the kmers we are interested in.
56+
double checkpoint = gbwt::readTimer();
5457
hash_map<Subchain::kmer_type, size_t> result;
5558
result.reserve(this->header.total_kmers);
5659
for (size_t chain_id = 0; chain_id < this->chains.size(); chain_id++) {
@@ -63,36 +66,40 @@ hash_map<Haplotypes::Subchain::kmer_type, size_t> Haplotypes::kmer_counts(const
6366
}
6467
}
6568
if (verbose) {
66-
std::cerr << "Initialized the hash map with " << result.size() << " kmers" << std::endl;
69+
double seconds = gbwt::readTimer() - checkpoint;
70+
std::cerr << "Initialized the hash map with " << result.size() << " kmers in " << seconds << " seconds" << std::endl;
6771
}
6872

69-
// Add the counts. We read the kmers by blocks to reduce overhead.
73+
// Add the counts. We read the kmers by blocks, which requires us to preallocate
74+
// the buffers.
75+
checkpoint = gbwt::readTimer();
7076
uint8_t* block = new uint8_t[kff_bytes(max_kmers + this->k() - 1)];
7177
uint8_t* data = new uint8_t[max_kmers * data_bytes];
7278
size_t kmer_count = 0, block_count = 0;
7379
while (reader.has_next()) {
74-
// This function call takes references to the pointers but assumes that the
75-
// underlying buffers have been pre-allocated.
7680
size_t n = reader.next_block(block, data);
77-
std::vector<Subchain::kmer_type> kmers = kff_recode(block, n, this->k(), decoding);
78-
for (size_t i = 0; i < n; i++) {
79-
auto iter = result.find(kmers[i]);
80-
if (iter != result.end()) {
81-
iter->second += kff_parse(data + i * data_bytes, data_bytes);
82-
} else {
83-
Subchain::kmer_type rc = minimizer_reverse_complement(kmers[i], this->k());
84-
iter = result.find(rc);
81+
if (n > 1) {
82+
std::vector<Subchain::kmer_type> kmers = kff_recode(block, n, this->k(), decoding);
83+
for (size_t i = 0; i < n; i++) {
84+
auto iter = find_kmer(result, kmers[i], this->k());
8585
if (iter != result.end()) {
8686
iter->second += kff_parse(data + i * data_bytes, data_bytes);
8787
}
8888
}
89+
} else {
90+
Subchain::kmer_type kmer = kff_recode(block, this->k(), decoding);
91+
auto iter = find_kmer(result, kmer, this->k());
92+
if (iter != result.end()) {
93+
iter->second += kff_parse(data, data_bytes);
94+
}
8995
}
9096
kmer_count += n; block_count++;
9197
}
9298
delete[] block; block = nullptr;
9399
delete[] data; data = nullptr;
94100
if (verbose) {
95-
std::cerr << "Read " << kmer_count << " kmers in " << block_count << " blocks" << std::endl;
101+
double seconds = gbwt::readTimer() - checkpoint;
102+
std::cerr << "Read " << kmer_count << " kmers in " << block_count << " blocks in " << seconds << " seconds" << std::endl;
96103
}
97104

98105
return result;

0 commit comments

Comments
 (0)