rhpvorderman · rhpvorderman · Nov 7, 2023 · Nov 7, 2023 · Nov 7, 2023 · Nov 7, 2023
diff --git a/scripts/fingerprinter.py b/scripts/fingerprinter.py
@@ -0,0 +1,26 @@
+import sys
+
+import dnaio
+
+import collections
+
+from sequali.report_modules import DuplicationCounts
+
+
+def fastq_file_to_hashes(fastq_file):
+    with dnaio.open(sys.argv[1], open_threads=1) as reader:
+        for read in reader:
+            length = len(read)
+            if length < 16:
+                h = hash(read.sequence)
+            else:
+                h = hash(read.sequence[:16]) ^ (length >> 6) ^ hash(read.sequence[-16:])
+            yield h
+
+
+if __name__ == "__main__":
+    counter = collections.Counter(fastq_file_to_hashes(sys.argv[1]))
+    dupcounter = collections.Counter(counter.values())
+    print(dict(sorted(dupcounter.items())))
+    estimated_fractions = DuplicationCounts.estimated_counts_to_fractions(dupcounter.items())
+    print(estimated_fractions)
diff --git a/score_to_error_rate.py → scripts/score_to_error_rate.py b/score_to_error_rate.py → scripts/score_to_error_rate.py
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
@@ -3814,8 +3814,23 @@ static int
 DedupEstimator_add_sequence_ptr(DedupEstimator *self, 
                                uint8_t *sequence, size_t sequence_length) 
 {
-    uint64_t hash = MurmurHash3_x64_64(
-        sequence, Py_MIN(UNIQUE_SEQUENCE_LENGTH, sequence_length));
+
+    uint64_t hash;
+    if (sequence_length < 16) {
+        hash = MurmurHash3_x64_64(sequence, sequence_length, 0);
+    } else {
+        /* Take 16 bytes from the beginning and the end. Some sequences may 
+           share the beginning, so taking the end properly distuingishes them. 
+           Also use the sequence length, but divide it by 64 so small 
+           differences due to indel sequencing errors in the middle do not 
+           affect the fingerprint. 
+           Another reason for taking 16bp is that this is exactly the murmur
+           hash block size which allows following the fastest code path. */
+        uint64_t seed = sequence_length >> 6;
+        uint64_t hash_front = MurmurHash3_x64_64(sequence, 16, seed);
+        uint64_t hash_back = MurmurHash3_x64_64(sequence + sequence_length - 16, 16, seed);
+        hash = hash_front ^ hash_back;
+    }
     size_t modulo_bits = self->modulo_bits;
     size_t ignore_mask = (1ULL << modulo_bits) - 1;
     if (hash & ignore_mask) {

diff --git a/src/sequali/murmur3.h b/src/sequali/murmur3.h
@@ -46,14 +46,14 @@ static FORCE_INLINE uint64_t fmix64 ( uint64_t k )
 
 //-----------------------------------------------------------------------------
 
-static uint64_t MurmurHash3_x64_64(const void *key, size_t len)
+static uint64_t MurmurHash3_x64_64(const void *key, size_t len, uint64_t seed)
 {
   const uint8_t * data = (const uint8_t*)key;
   const int nblocks = len / 16;
   int i;
 
-  uint64_t h1 = 0;
-  uint64_t h2 = 0;
+  uint64_t h1 = seed;
+  uint64_t h2 = seed;
 
   uint64_t c1 = 0x87c37b91114253d5ULL;
   uint64_t c2 = 0x4cf5ad432745937fULL;

diff --git a/src/sequali/report_modules.py b/src/sequali/report_modules.py
@@ -840,8 +840,17 @@ def plot(self):
 
     def to_html(self):
         first_part = f"""
-        This estimates the fraction of the duplication based on
-        {self.tracked_unique_sequences:,} sampled unique sequences. <br>
+        All sequences are fingerprinted based on the first 16bp, the last 16bp
+        and the length integer divided by 64. This means that for long
+        read sequences, small indel sequencing errors will most likely not
+        affect the fingerprint. <br>
+        <br>
+        A subsample of the fingerprints is stored to estimate the duplication
+        rate. The subsample for this file consists of
+        {self.tracked_unique_sequences:,} fingerprints.
+        The paper describing the methodology can be found
+        <a href=https://www.usenix.org/system/files/conference/atc13/atc13-xie.pdf>
+        here</a>.<br>
         Estimated remaining sequences if deduplicated:
         {self.remaining_fraction:.2%}
             """