@@ -31,10 +31,12 @@ std::string to_string(handle_t handle) {
31
31
32
32
// ------------------------------------------------------------------------------
33
33
34
- // Returns the big-endian interger representation of the kmer in the canonical orientation.
35
- std::uint64_t kff_to_key (const std::uint8_t * kmer, size_t k, size_t bytes, const std::uint8_t * encoding) {
36
- std::vector<std::uint8_t > rc = kff_reverse_complement (kmer, k, encoding);
37
- return std::min (kff_parse (kmer, bytes), kff_parse (rc.data (), bytes));
34
+ hash_map<Haplotypes::Subchain::kmer_type, size_t >::iterator
35
+ find_kmer (hash_map<Haplotypes::Subchain::kmer_type, size_t >& counts, Haplotypes::Subchain::kmer_type kmer, size_t k) {
36
+ Haplotypes::Subchain::kmer_type rc = minimizer_reverse_complement (kmer, k);
37
+ auto forward = counts.find (kmer);
38
+ auto reverse = counts.find (rc);
39
+ return (forward != counts.end () ? forward : reverse);
38
40
}
39
41
40
42
hash_map<Haplotypes::Subchain::kmer_type, size_t > Haplotypes::kmer_counts (const std::string& kff_file, bool verbose) const {
@@ -51,6 +53,7 @@ hash_map<Haplotypes::Subchain::kmer_type, size_t> Haplotypes::kmer_counts(const
51
53
size_t data_bytes = reader.get_var (" data_size" );
52
54
53
55
// Populate the map with the kmers we are interested in.
56
+ double checkpoint = gbwt::readTimer ();
54
57
hash_map<Subchain::kmer_type, size_t > result;
55
58
result.reserve (this ->header .total_kmers );
56
59
for (size_t chain_id = 0 ; chain_id < this ->chains .size (); chain_id++) {
@@ -63,36 +66,40 @@ hash_map<Haplotypes::Subchain::kmer_type, size_t> Haplotypes::kmer_counts(const
63
66
}
64
67
}
65
68
if (verbose) {
66
- std::cerr << " Initialized the hash map with " << result.size () << " kmers" << std::endl;
69
+ double seconds = gbwt::readTimer () - checkpoint;
70
+ std::cerr << " Initialized the hash map with " << result.size () << " kmers in " << seconds << " seconds" << std::endl;
67
71
}
68
72
69
- // Add the counts. We read the kmers by blocks to reduce overhead.
73
+ // Add the counts. We read the kmers by blocks, which requires us to preallocate
74
+ // the buffers.
75
+ checkpoint = gbwt::readTimer ();
70
76
uint8_t * block = new uint8_t [kff_bytes (max_kmers + this ->k () - 1 )];
71
77
uint8_t * data = new uint8_t [max_kmers * data_bytes];
72
78
size_t kmer_count = 0 , block_count = 0 ;
73
79
while (reader.has_next ()) {
74
- // This function call takes references to the pointers but assumes that the
75
- // underlying buffers have been pre-allocated.
76
80
size_t n = reader.next_block (block, data);
77
- std::vector<Subchain::kmer_type> kmers = kff_recode (block, n, this ->k (), decoding);
78
- for (size_t i = 0 ; i < n; i++) {
79
- auto iter = result.find (kmers[i]);
80
- if (iter != result.end ()) {
81
- iter->second += kff_parse (data + i * data_bytes, data_bytes);
82
- } else {
83
- Subchain::kmer_type rc = minimizer_reverse_complement (kmers[i], this ->k ());
84
- iter = result.find (rc);
81
+ if (n > 1 ) {
82
+ std::vector<Subchain::kmer_type> kmers = kff_recode (block, n, this ->k (), decoding);
83
+ for (size_t i = 0 ; i < n; i++) {
84
+ auto iter = find_kmer (result, kmers[i], this ->k ());
85
85
if (iter != result.end ()) {
86
86
iter->second += kff_parse (data + i * data_bytes, data_bytes);
87
87
}
88
88
}
89
+ } else {
90
+ Subchain::kmer_type kmer = kff_recode (block, this ->k (), decoding);
91
+ auto iter = find_kmer (result, kmer, this ->k ());
92
+ if (iter != result.end ()) {
93
+ iter->second += kff_parse (data, data_bytes);
94
+ }
89
95
}
90
96
kmer_count += n; block_count++;
91
97
}
92
98
delete[] block; block = nullptr ;
93
99
delete[] data; data = nullptr ;
94
100
if (verbose) {
95
- std::cerr << " Read " << kmer_count << " kmers in " << block_count << " blocks" << std::endl;
101
+ double seconds = gbwt::readTimer () - checkpoint;
102
+ std::cerr << " Read " << kmer_count << " kmers in " << block_count << " blocks in " << seconds << " seconds" << std::endl;
96
103
}
97
104
98
105
return result;
0 commit comments