diff --git a/scripts/fingerprinter.py b/scripts/fingerprinter.py
new file mode 100644
index 00000000..2e3328ec
--- /dev/null
+++ b/scripts/fingerprinter.py
@@ -0,0 +1,31 @@
+import collections
+import sys
+
+import dnaio
+
+from sequali.report_modules import DuplicationCounts
+
+
+def fastq_file_to_hashes(fastq_file):
+    with dnaio.open(fastq_file, open_threads=1) as reader:
+ for read in reader:
+ length = len(read)
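+            # Approximate the fingerprint used in _qcmodule.c: combine the
+            # first and last 16 bases with the length divided by 64 so that
+            # small indels in the middle do not change the fingerprint.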
+ if length < 16:
+ h = hash(read.sequence)
+ else:
+ h = hash(read.sequence[:16]) ^ (length >> 6) ^ hash(read.sequence[-16:])
+ yield h
+
+
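+# Usage: python scripts/fingerprinter.py <FASTQ file>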
+if __name__ == "__main__":
+ counter = collections.Counter(fastq_file_to_hashes(sys.argv[1]))
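+    # dupcounter maps each duplication count to the number of distinct
+    # fingerprints that were seen exactly that many times.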
+ dupcounter = collections.Counter(counter.values())
+ print(dict(sorted(dupcounter.items())))
+ estimated_fractions = DuplicationCounts.estimated_counts_to_fractions(dupcounter.items())
+ print(estimated_fractions)
diff --git a/score_to_error_rate.py b/scripts/score_to_error_rate.py
similarity index 100%
rename from score_to_error_rate.py
rename to scripts/score_to_error_rate.py
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index 92e74f1e..67ce7be9 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -3814,8 +3814,23 @@ static int
DedupEstimator_add_sequence_ptr(DedupEstimator *self,
uint8_t *sequence, size_t sequence_length)
{
- uint64_t hash = MurmurHash3_x64_64(
- sequence, Py_MIN(UNIQUE_SEQUENCE_LENGTH, sequence_length));
+ uint64_t hash;
+ if (sequence_length < 16) {
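+        /* Too short to take 16 bytes from both ends; hash the whole sequence. */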
+ hash = MurmurHash3_x64_64(sequence, sequence_length, 0);
+ } else {
+        /* Take 16 bytes from the beginning and the end. Some sequences
+           may share the same beginning, so taking the end as well properly
+           distinguishes them. The sequence length, divided by 64, is used
+           as the seed so that small differences due to indel sequencing
+           errors in the middle do not affect the fingerprint.
+           Taking 16 bytes also matches the murmur hash block size exactly,
+           which allows following the fastest code path. */
+ uint64_t seed = sequence_length >> 6;
+ uint64_t hash_front = MurmurHash3_x64_64(sequence, 16, seed);
+ uint64_t hash_back = MurmurHash3_x64_64(sequence + sequence_length - 16, 16, seed);
+ hash = hash_front ^ hash_back;
+ }
size_t modulo_bits = self->modulo_bits;
size_t ignore_mask = (1ULL << modulo_bits) - 1;
if (hash & ignore_mask) {
diff --git a/src/sequali/murmur3.h b/src/sequali/murmur3.h
index 00aaba98..dd786872 100644
--- a/src/sequali/murmur3.h
+++ b/src/sequali/murmur3.h
@@ -46,14 +46,14 @@ static FORCE_INLINE uint64_t fmix64 ( uint64_t k )
//-----------------------------------------------------------------------------
-static uint64_t MurmurHash3_x64_64(const void *key, size_t len)
+static uint64_t MurmurHash3_x64_64(const void *key, size_t len, uint64_t seed)
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 16;
int i;
- uint64_t h1 = 0;
- uint64_t h2 = 0;
+ uint64_t h1 = seed;
+ uint64_t h2 = seed;
uint64_t c1 = 0x87c37b91114253d5ULL;
uint64_t c2 = 0x4cf5ad432745937fULL;
diff --git a/src/sequali/report_modules.py b/src/sequali/report_modules.py
index 2cb5c724..ce787da1 100644
--- a/src/sequali/report_modules.py
+++ b/src/sequali/report_modules.py
@@ -840,8 +840,15 @@ def plot(self):
def to_html(self):
first_part = f"""
- This estimates the fraction of the duplication based on
- {self.tracked_unique_sequences:,} sampled unique sequences.
+            All sequences are fingerprinted using their first 16 bp, their
+            last 16 bp and their length divided by 64 (integer division).
+            For long reads this means that small indel sequencing errors
+            will most likely not affect the fingerprint.
+
+            A subsample of the fingerprints is stored to estimate the
+            duplication rate. The subsample for this file consists of
+            {self.tracked_unique_sequences:,} fingerprints.
+            The paper describing the methodology can be found here.
Estimated remaining sequences if deduplicated:
{self.remaining_fraction:.2%}
"""