diff --git a/scripts/fingerprinter.py b/scripts/fingerprinter.py
new file mode 100644
index 00000000..2e3328ec
--- /dev/null
+++ b/scripts/fingerprinter.py
@@ -0,0 +1,26 @@
+import sys
+
+import dnaio
+
+import collections
+
+from sequali.report_modules import DuplicationCounts
+
+
+def fastq_file_to_hashes(fastq_file):
+    with dnaio.open(fastq_file, open_threads=1) as reader:
+        for read in reader:
+            length = len(read)
+            if length < 16:
+                h = hash(read.sequence)
+            else:
+                h = hash(read.sequence[:16]) ^ (length >> 6) ^ hash(read.sequence[-16:])
+            yield h
+
+
+if __name__ == "__main__":
+    counter = collections.Counter(fastq_file_to_hashes(sys.argv[1]))
+    dupcounter = collections.Counter(counter.values())
+    print(dict(sorted(dupcounter.items())))
+    estimated_fractions = DuplicationCounts.estimated_counts_to_fractions(dupcounter.items())
+    print(estimated_fractions)
\ No newline at end of file
diff --git a/score_to_error_rate.py b/scripts/score_to_error_rate.py
similarity index 100%
rename from score_to_error_rate.py
rename to scripts/score_to_error_rate.py
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index 92e74f1e..67ce7be9 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -3814,8 +3814,23 @@ static int
 DedupEstimator_add_sequence_ptr(DedupEstimator *self, uint8_t *sequence,
                                 size_t sequence_length)
 {
-    uint64_t hash = MurmurHash3_x64_64(
-        sequence, Py_MIN(UNIQUE_SEQUENCE_LENGTH, sequence_length));
+
+    uint64_t hash;
+    if (sequence_length < 16) {
+        hash = MurmurHash3_x64_64(sequence, sequence_length, 0);
+    } else {
+        /* Take 16 bytes from the beginning and the end. Some sequences may
+           share the beginning, so taking the end properly distinguishes them.
+           Also use the sequence length, but divide it by 64 so small
+           differences due to indel sequencing errors in the middle do not
+           affect the fingerprint.
+           Another reason for taking 16 bp is that this is exactly the MurmurHash
+           block size, which allows following the fastest code path. */
+        uint64_t seed = sequence_length >> 6;
+        uint64_t hash_front = MurmurHash3_x64_64(sequence, 16, seed);
+        uint64_t hash_back = MurmurHash3_x64_64(sequence + sequence_length - 16, 16, seed);
+        hash = hash_front ^ hash_back;
+    }
     size_t modulo_bits = self->modulo_bits;
     size_t ignore_mask = (1ULL << modulo_bits) - 1;
     if (hash & ignore_mask) {
diff --git a/src/sequali/murmur3.h b/src/sequali/murmur3.h
index 00aaba98..dd786872 100644
--- a/src/sequali/murmur3.h
+++ b/src/sequali/murmur3.h
@@ -46,14 +46,14 @@ static FORCE_INLINE uint64_t fmix64 ( uint64_t k )
 
 //-----------------------------------------------------------------------------
 
-static uint64_t MurmurHash3_x64_64(const void *key, size_t len)
+static uint64_t MurmurHash3_x64_64(const void *key, size_t len, uint64_t seed)
 {
   const uint8_t * data = (const uint8_t*)key;
   const int nblocks = len / 16;
   int i;
 
-  uint64_t h1 = 0;
-  uint64_t h2 = 0;
+  uint64_t h1 = seed;
+  uint64_t h2 = seed;
 
   uint64_t c1 = 0x87c37b91114253d5ULL;
   uint64_t c2 = 0x4cf5ad432745937fULL;
diff --git a/src/sequali/report_modules.py b/src/sequali/report_modules.py
index 2cb5c724..ce787da1 100644
--- a/src/sequali/report_modules.py
+++ b/src/sequali/report_modules.py
@@ -840,8 +840,17 @@ def plot(self):
 
     def to_html(self):
         first_part = f"""
-            This estimates the fraction of the duplication based on
-            {self.tracked_unique_sequences:,} sampled unique sequences. <br>
+            All sequences are fingerprinted based on the first 16 bp, the last 16 bp
+            and the length integer-divided by 64. This means that for long
+            read sequences, small indel sequencing errors will most likely not
+            affect the fingerprint. <br>
+            <br>
+            A subsample of the fingerprints is stored to estimate the duplication
+            rate. The subsample for this file consists of
+            {self.tracked_unique_sequences:,} fingerprints.
+            The paper describing the methodology can be found
+
+            here. <br>
             Estimated remaining sequences if deduplicated:
             {self.remaining_fraction:.2%}
         """
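
Note (not part of the patch): a minimal standalone Python sketch of the fingerprint scheme described in the C comment and report text above. The fingerprint() helper is hypothetical and uses Python's built-in hash() as a stand-in for MurmurHash3_x64_64; it illustrates why integer-dividing the length by 64 keeps a small mid-read indel from changing the fingerprint.

def fingerprint(sequence: str) -> int:
    # Hypothetical stand-in for the hashing in DedupEstimator_add_sequence_ptr;
    # Python's hash() replaces MurmurHash3_x64_64 here.
    length = len(sequence)
    if length < 16:
        return hash(sequence)
    # length >> 6 plays the role of the seed, so an indel that does not push
    # the length across a 64-base boundary leaves the fingerprint unchanged.
    seed = length >> 6
    return hash((sequence[:16], seed)) ^ hash((sequence[-16:], seed))


if __name__ == "__main__":
    read = "AAAACCCCGGGGTTTT" + "ACGT" * 296          # 1200 bp long read
    read_with_indel = read[:600] + "A" + read[600:]   # one insertion mid-read
    assert fingerprint(read) == fingerprint(read_with_indel)
    print("same fingerprint despite a 1 bp insertion in the middle")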