Skip to content

Commit

Permalink
Remove code copy in index construction.
Browse files Browse the repository at this point in the history
  • Loading branch information
haowenz authored and swiftgenomics committed Nov 13, 2022
1 parent b7ca7e1 commit abab51f
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 22 deletions.
2 changes: 1 addition & 1 deletion src/chromap.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#include "temp_mapping.h"
#include "utils.h"

#define CHROMAP_VERSION "0.2.3-r426"
#define CHROMAP_VERSION "0.2.3-r427"

namespace chromap {

Expand Down
42 changes: 21 additions & 21 deletions src/index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ namespace chromap {

void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {
const double real_start_time = GetRealTime();

std::vector<Minimizer> minimizers;
minimizers.reserve(reference.GetNumBases() / window_size_ * 2);

std::cerr << "Collecting minimizers.\n";
MinimizerGenerator minimizer_generator(kmer_size_, window_size_);
for (uint32_t sequence_index = 0; sequence_index < num_sequences;
++sequence_index) {
Expand All @@ -22,30 +24,40 @@ void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {
}
std::cerr << "Collected " << minimizers.size() << " minimizers.\n";

std::cerr << "Sorting minimizers.\n";
std::stable_sort(minimizers.begin(), minimizers.end());
std::cerr << "Sorted minimizers.\n";
std::cerr << "Sorted all minimizers.\n";

const uint32_t num_minimizers = minimizers.size();

assert(num_minimizers > 0);
// TODO: check this assert!
// Here I make sure the # minimizers is less than the limit of signed int32,
// so that I can use int to store position later.
assert(num_minimizers != 0 && num_minimizers <= INT_MAX);
assert(num_minimizers <= INT_MAX);

occurrence_table_.reserve(num_minimizers);

uint64_t previous_key = minimizers[0].GetHashKey();
uint32_t num_previous_minimizer_occurrences = 0;
uint64_t num_nonsingletons = 0;
uint32_t num_singletons = 0;

for (uint32_t ti = 0; ti < num_minimizers; ++ti) {
uint64_t current_key = minimizers[ti].GetHashKey();
for (uint32_t ti = 0; ti <= num_minimizers; ++ti) {
const bool is_last_iteration = ti == num_minimizers;
const uint64_t current_key =
is_last_iteration ? previous_key + 1 : minimizers[ti].GetHashKey();

if (current_key != previous_key) {
int khash_return_code = 0;
khiter_t khash_iterator =
kh_put(k64, lookup_table_, previous_key << 1, &khash_return_code);
assert(khash_return_code != -1 && khash_return_code != 0);

if (num_previous_minimizer_occurrences == 1) { // singleton
if (num_previous_minimizer_occurrences == 1) {
// We set the lowest bit of the key value to 1 if the minimizer only
// occurs once. And the occurrence is directly saved in the lookup
// table.
kh_key(lookup_table_, khash_iterator) |= 1;
kh_value(lookup_table_, khash_iterator) = occurrence_table_.back();
occurrence_table_.pop_back();
Expand All @@ -60,26 +72,14 @@ void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {
num_previous_minimizer_occurrences++;
}

if (is_last_iteration) {
break;
}

occurrence_table_.push_back(minimizers[ti].GetMinimizer());
previous_key = current_key;
}

int khash_return_code = 0;
khiter_t khash_iterator =
kh_put(k64, lookup_table_, previous_key << 1, &khash_return_code);
assert(khash_return_code != -1 && khash_return_code != 0);

if (num_previous_minimizer_occurrences == 1) { // singleton
kh_key(lookup_table_, khash_iterator) |= 1;
kh_value(lookup_table_, khash_iterator) = occurrence_table_.back();
occurrence_table_.pop_back();
++num_singletons;
} else {
kh_value(lookup_table_, khash_iterator) =
(num_nonsingletons << 32) | num_previous_minimizer_occurrences;
num_nonsingletons += num_previous_minimizer_occurrences;
}

assert(num_nonsingletons + num_singletons == num_minimizers);

std::cerr << "Kmer size: " << kmer_size_ << ", window size: " << window_size_
Expand Down

0 comments on commit abab51f

Please sign in to comment.