Skip to content

Use a smarter read finger print for deduplication estimation #43

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 7, 2023
26 changes: 26 additions & 0 deletions scripts/fingerprinter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import sys

import dnaio

import collections

from sequali.report_modules import DuplicationCounts


def fastq_file_to_hashes(fastq_file):
with dnaio.open(sys.argv[1], open_threads=1) as reader:
for read in reader:
length = len(read)
if length < 16:
h = hash(read.sequence)
else:
h = hash(read.sequence[:16]) ^ (length >> 6) ^ hash(read.sequence[-16:])
yield h


if __name__ == "__main__":
counter = collections.Counter(fastq_file_to_hashes(sys.argv[1]))
dupcounter = collections.Counter(counter.values())
print(dict(sorted(dupcounter.items())))
estimated_fractions = DuplicationCounts.estimated_counts_to_fractions(dupcounter.items())
print(estimated_fractions)
File renamed without changes.
19 changes: 17 additions & 2 deletions src/sequali/_qcmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -3814,8 +3814,23 @@ static int
DedupEstimator_add_sequence_ptr(DedupEstimator *self,
uint8_t *sequence, size_t sequence_length)
{
uint64_t hash = MurmurHash3_x64_64(
sequence, Py_MIN(UNIQUE_SEQUENCE_LENGTH, sequence_length));

uint64_t hash;
if (sequence_length < 16) {
hash = MurmurHash3_x64_64(sequence, sequence_length, 0);
} else {
/* Take 16 bytes from the beginning and the end. Some sequences may
share the beginning, so taking the end properly distuingishes them.
Also use the sequence length, but divide it by 64 so small
differences due to indel sequencing errors in the middle do not
affect the fingerprint.
Another reason for taking 16bp is that this is exactly the murmur
hash block size which allows following the fastest code path. */
uint64_t seed = sequence_length >> 6;
uint64_t hash_front = MurmurHash3_x64_64(sequence, 16, seed);
uint64_t hash_back = MurmurHash3_x64_64(sequence + sequence_length - 16, 16, seed);
hash = hash_front ^ hash_back;
}
size_t modulo_bits = self->modulo_bits;
size_t ignore_mask = (1ULL << modulo_bits) - 1;
if (hash & ignore_mask) {
Expand Down
6 changes: 3 additions & 3 deletions src/sequali/murmur3.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ static FORCE_INLINE uint64_t fmix64 ( uint64_t k )

//-----------------------------------------------------------------------------

static uint64_t MurmurHash3_x64_64(const void *key, size_t len)
static uint64_t MurmurHash3_x64_64(const void *key, size_t len, uint64_t seed)
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 16;
int i;

uint64_t h1 = 0;
uint64_t h2 = 0;
uint64_t h1 = seed;
uint64_t h2 = seed;

uint64_t c1 = 0x87c37b91114253d5ULL;
uint64_t c2 = 0x4cf5ad432745937fULL;
Expand Down
13 changes: 11 additions & 2 deletions src/sequali/report_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,8 +840,17 @@ def plot(self):

def to_html(self):
first_part = f"""
This estimates the fraction of the duplication based on
{self.tracked_unique_sequences:,} sampled unique sequences. <br>
All sequences are fingerprinted based on the first 16bp, the last 16bp
and the length integer divided by 64. This means that for long
read sequences, small indel sequencing errors will most likely not
affect the fingerprint. <br>
<br>
A subsample of the fingerprints is stored to estimate the duplication
rate. The subsample for this file consists of
{self.tracked_unique_sequences:,} fingerprints.
The paper describing the methodology can be found
<a href=https://www.usenix.org/system/files/conference/atc13/atc13-xie.pdf>
here</a>.<br>
Estimated remaining sequences if deduplicated:
{self.remaining_fraction:.2%}
"""
Expand Down