diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c index a2ec7562..0b3733ee 100644 --- a/src/sequali/_qcmodule.c +++ b/src/sequali/_qcmodule.c @@ -3819,6 +3819,14 @@ DedupEstimator_add_sequence_ptr(DedupEstimator *self, if (sequence_length < 16) { hash = MurmurHash3_x64_64(sequence, sequence_length, 0); } else { + /* Take 16 bytes from the beginning and the end. Some sequences may + share the beginning, so taking the end properly distinguishes them. + Also use the sequence length, but divide it by 64 so small + differences due to indel sequencing errors in the middle do not + affect the fingerprint. + Another reason for taking 16bp is that this is exactly the murmur + hash block size and the compiler can inline a fast version when + the length is a constant 16. */ uint64_t seed = sequence_length >> 6; uint64_t hash_front = MurmurHash3_x64_64(sequence, 16, seed); uint64_t hash_back = MurmurHash3_x64_64(sequence + sequence_length - 16, 16, seed);