Merge pull request #110 from rhpvorderman/adjustablefingerprint
Make deduplication module fingerprint adjustable.
rhpvorderman authored Mar 25, 2024
2 parents d9ae002 + 797c71f commit 0f5a3e3
Showing 8 changed files with 354 additions and 48 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
src/sequali/_version.py

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
8 changes: 8 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,14 @@ Changelog
.. This document is user facing. Please word the changes in such a way
.. that users understand how the changes affect the new version.
version 0.6.0-dev
-----------------
+ The deduplication fingerprint is now configurable from the command line.
+ The deduplication module now starts by gathering all sequences rather than
half of them, so that all sequences can be considered when the hash table is
large enough.

version 0.5.1
-----------------
+ Fix a bug in the overrepresented sequence sampling where the fragments from
22 changes: 21 additions & 1 deletion README.rst
@@ -79,7 +79,11 @@ Usage
[--overrepresentation-max-unique-fragments N]
[--overrepresentation-fragment-length LENGTH]
[--overrepresentation-sample-every DIVISOR]
[--deduplication-estimate-bits BITS] [-t THREADS] [--version]
[--deduplication-estimate-bits BITS]
[--fingerprint-front-length LENGTH]
[--fingerprint-back-length LENGTH]
[--fingerprint-front-offset LENGTH]
[--fingerprint-back-offset LENGTH] [-t THREADS] [--version]
INPUT
Create a quality metrics report for sequencing data.
@@ -132,6 +136,22 @@ Usage
estimate the deduplication rate. Maximum stored
sequences: 2 ** bits * 7 // 10. Memory required: 2 **
bits * 24. Default: 21.
--fingerprint-front-length LENGTH
Set the number of bases to be taken for the
deduplication fingerprint from the front of the
sequence. Default: 8.
--fingerprint-back-length LENGTH
Set the number of bases to be taken for the
deduplication fingerprint from the back of the
sequence. Default: 8.
--fingerprint-front-offset LENGTH
Set the offset for the front part of the deduplication
fingerprint. Useful for avoiding adapter sequences.
Default: 64.
--fingerprint-back-offset LENGTH
Set the offset for the back part of the deduplication
fingerprint. Useful for avoiding adapter sequences.
Default: 64.
-t THREADS, --threads THREADS
Number of threads to use. If greater than one sequali
will use an additional thread for gzip decompression.
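The help text above quotes two sizing formulas for --deduplication-estimate-bits. As a quick, illustrative sanity check (plain Python arithmetic; nothing here is part of the sequali API):

    bits = 21                          # the documented default
    max_stored = 2 ** bits * 7 // 10   # maximum stored sequences: 1,468,006
    memory_bytes = 2 ** bits * 24      # memory required: 50,331,648 (~48 MiB)
    print(f"{max_stored:,} sequences, {memory_bytes / 2 ** 20:.0f} MiB")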
56 changes: 51 additions & 5 deletions src/sequali/__main__.py
@@ -21,10 +21,22 @@
import xopen

from ._qc import (
AdapterCounter, BamParser, DEFAULT_DEDUP_HASH_TABLE_SIZE_BITS,
DEFAULT_FRAGMENT_LENGTH, DEFAULT_MAX_UNIQUE_FRAGMENTS,
DEFAULT_UNIQUE_SAMPLE_EVERY, DedupEstimator, FastqParser, NanoStats,
PerTileQuality, QCMetrics, SequenceDuplication
AdapterCounter,
BamParser,
DEFAULT_DEDUP_HASH_TABLE_SIZE_BITS,
DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH,
DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET,
DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH,
DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET,
DEFAULT_FRAGMENT_LENGTH,
DEFAULT_MAX_UNIQUE_FRAGMENTS,
DEFAULT_UNIQUE_SAMPLE_EVERY,
DedupEstimator,
FastqParser,
NanoStats,
PerTileQuality,
QCMetrics,
SequenceDuplication
)
from ._version import __version__
from .adapters import DEFAULT_ADAPTER_FILE, adapters_from_file
@@ -109,6 +121,35 @@ def argument_parser() -> argparse.ArgumentParser:
f"Maximum stored sequences: 2 ** bits * 7 // 10. "
f"Memory required: 2 ** bits * 24. "
f"Default: {DEFAULT_DEDUP_HASH_TABLE_SIZE_BITS}.")
parser.add_argument("--fingerprint-front-length", type=int,
default=DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH,
metavar="LENGTH",
help=f"Set the number of bases to be taken for the "
f"deduplication fingerprint from the front of "
f"the sequence. "
f"Default: {DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH}.")

parser.add_argument("--fingerprint-back-length", type=int,
default=DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH,
metavar="LENGTH",
help=f"Set the number of bases to be taken for the "
f"deduplication fingerprint from the back of "
f"the sequence. "
f"Default: {DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH}.")
parser.add_argument("--fingerprint-front-offset", type=int,
default=DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET,
metavar="LENGTH",
help=f"Set the offset for the front part of the "
f"deduplication fingerprint. Useful for avoiding "
f"adapter sequences. "
f"Default: {DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET}.")
parser.add_argument("--fingerprint-back-offset", type=int,
default=DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET,
metavar="LENGTH",
help=f"Set the offset for the back part of the "
f"deduplication fingerprint. Useful for avoiding "
f"adapter sequences. "
f"Default: {DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET}.")
parser.add_argument("-t", "--threads", type=int, default=2,
help="Number of threads to use. If greater than one "
"sequali will use an additional thread for gzip "
@@ -133,7 +174,12 @@ def main() -> None:
sample_every=args.overrepresentation_sample_every
)
dedup_estimator = DedupEstimator(
hash_table_size_bits=args.deduplication_estimate_bits)
hash_table_size_bits=args.deduplication_estimate_bits,
front_sequence_length=args.fingerprint_front_length,
front_sequence_offset=args.fingerprint_front_offset,
back_sequence_length=args.fingerprint_back_length,
back_sequence_offset=args.fingerprint_back_offset,
)
nanostats = NanoStats()
filename: str = args.input
threads = args.threads
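main() above forwards the four new command-line options directly into DedupEstimator. A minimal construction sketch with the documented defaults spelled out (assuming sequali is installed; the argument names are taken from the diff above):

    from sequali._qc import DedupEstimator

    # The fingerprint arguments are keyword-only.
    dedup_estimator = DedupEstimator(
        hash_table_size_bits=21,
        front_sequence_length=8,
        front_sequence_offset=64,
        back_sequence_length=8,
        back_sequence_offset=64,
    )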
21 changes: 19 additions & 2 deletions src/sequali/_qc.pyi
@@ -31,7 +31,12 @@ MAX_SEQUENCE_SIZE: int
DEFAULT_MAX_UNIQUE_FRAGMENTS: int
DEFAULT_DEDUP_HASH_TABLE_SIZE_BITS: int
DEFAULT_FRAGMENT_LENGTH: int
DEFAULT_UNIQUE_SAMPLE_EVERY: int
DEFAULT_UNIQUE_SAMPLE_EVERY: int
DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH: int
DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH: int
DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET: int
DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET: int


class FastqRecordView:
obj: bytes
@@ -114,8 +119,20 @@ class DedupEstimator:
_modulo_bits: int
_hash_table_size: int
tracked_sequences: int
front_sequence_length: int
back_sequence_length: int
front_sequence_offset: int
back_sequence_offset: int

def __init__(self, hash_table_size_bits: int = 21): ...
def __init__(
self,
hash_table_size_bits: int = DEFAULT_DEDUP_HASH_TABLE_SIZE_BITS,
*,
front_sequence_length: int = DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH,
back_sequence_length: int = DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH,
front_sequence_offset: int = DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET,
back_sequence_offset: int = DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET,
): ...
def add_sequence(self, __sequence: str) -> None: ...
def add_record_array(self, __record_array: FastqRecordArrayView) -> None: ...
def duplication_counts(self) -> array.ArrayType: ...
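A short usage sketch based on the stub above (the read sequence is made up; duplication_counts() returns an array.ArrayType per the signature):

    from sequali._qc import DedupEstimator

    estimator = DedupEstimator()
    estimator.add_sequence("ACGT" * 40)
    estimator.add_sequence("ACGT" * 40)  # exact duplicate, same fingerprint
    counts = estimator.duplication_counts()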
126 changes: 102 additions & 24 deletions src/sequali/_qcmodule.c
@@ -3715,6 +3715,19 @@ Fei Xie, Michael Condict, Sandip Shete
// accurate results.
#define DEFAULT_DEDUP_HASH_TABLE_SIZE_BITS 21

/*
By default, avoid the beginning and end of the sequence by at most 64 bp
to steer clear of any adapters. Take the 8 bp after the front offset and
the 8 bp before the back offset. This creates a small 16 bp fingerprint,
which is hashed using MurmurHash. A 16 bp fingerprint is small and
therefore relatively insensitive to sequencing errors, while still
offering 4^16 (roughly 4 billion) distinct fingerprints. All four values
below are adjustable from Python.
*/
#define DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH 8
#define DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH 8
#define DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET 64
#define DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET 64

// Use packing at the 4-byte boundary to save 4 bytes of storage.
#pragma pack(4)
struct EstimatorEntry {
@@ -3729,22 +3742,43 @@ typedef struct _DedupEstimatorStruct {
size_t hash_table_size;
size_t max_stored_entries;
size_t stored_entries;
size_t front_sequence_length;
size_t front_sequence_offset;
size_t back_sequence_length;
size_t back_sequence_offset;
uint8_t *fingerprint_store;
struct EstimatorEntry *hash_table;
} DedupEstimator;

static void
DedupEstimator_dealloc(DedupEstimator *self) {
PyMem_Free(self->hash_table);
PyMem_Free(self->fingerprint_store);
Py_TYPE(self)->tp_free((PyObject *)self);
}

static PyObject *
DedupEstimator__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
Py_ssize_t hash_table_size_bits = DEFAULT_DEDUP_HASH_TABLE_SIZE_BITS;
static char *kwargnames[] = {"hash_table_size_bits", NULL};
static char *format = "|n:DedupEstimator";
Py_ssize_t front_sequence_length = DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH;
Py_ssize_t front_sequence_offset = DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET;
Py_ssize_t back_sequence_length = DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH;
Py_ssize_t back_sequence_offset = DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET;
static char *kwargnames[] = {
"hash_table_size_bits",
"front_sequence_length",
"back_sequence_length",
"front_sequence_offset",
"back_sequence_offset",
NULL
};
static char *format = "|n$nnnn:DedupEstimator";
if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames,
&hash_table_size_bits)) {
&hash_table_size_bits,
&front_sequence_length,
&back_sequence_length,
&front_sequence_offset,
&back_sequence_offset)) {
return NULL;
}
if (hash_table_size_bits < 8 || hash_table_size_bits > 58) {
@@ -3755,21 +3789,58 @@ DedupEstimator__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
);
return NULL;
}
Py_ssize_t lengths_and_offsets[4] = {
front_sequence_length,
back_sequence_length,
front_sequence_offset,
back_sequence_offset,
};
for (size_t i=0; i < 4; i++) {
if (lengths_and_offsets[i] < 0) {
PyErr_Format(
PyExc_ValueError,
"%s must be at least 0, got %zd.",
kwargnames[i+1],
lengths_and_offsets[i]
);
return NULL;
}
}
size_t fingerprint_size = front_sequence_length + back_sequence_length;
if (fingerprint_size == 0) {
PyErr_SetString(
PyExc_ValueError,
"The sum of front_sequence_length and back_sequence_length must be at least 0"
);
return NULL;
}

size_t hash_table_size = 1ULL << hash_table_size_bits;
uint8_t *fingerprint_store = PyMem_Malloc(fingerprint_size);
if (fingerprint_store == NULL) {
return PyErr_NoMemory();
}
struct EstimatorEntry *hash_table = PyMem_Calloc(hash_table_size, sizeof(struct EstimatorEntry));
if (hash_table == NULL) {
PyMem_Free(fingerprint_store);
return PyErr_NoMemory();
}
DedupEstimator *self = PyObject_New(DedupEstimator, type);
if (self == NULL) {
PyMem_Free(fingerprint_store);
PyMem_Free(hash_table);
return PyErr_NoMemory();
}
self->front_sequence_length = front_sequence_length;
self->front_sequence_offset = front_sequence_offset;
self->back_sequence_length = back_sequence_length;
self->back_sequence_offset = back_sequence_offset;
self->fingerprint_store = fingerprint_store;
self->hash_table_size = hash_table_size;
// Get about 70% occupancy max
self->max_stored_entries = (hash_table_size * 7) / 10;
self->hash_table = hash_table;
self->modulo_bits = 1;
self->modulo_bits = 0;
self->stored_entries = 0;
return (PyObject *)self;
}
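The argument validation above surfaces in Python as a ValueError. An illustrative check (a sketch, not a test shipped with this PR):

    from sequali._qc import DedupEstimator

    try:
        DedupEstimator(front_sequence_length=0, back_sequence_length=0)
    except ValueError as error:
        print(error)  # combined fingerprint length must be at least 1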
@@ -3816,35 +3887,30 @@ DedupEstimator_increment_modulo(DedupEstimator *self)
return 0;
}

/*
Avoid the beginning and end of the sequence by at most 64 bp to avoid
any adapters. Take the 8 bp after the start offset and the 8 bp before
the end offset. This creates a small 16 bp fingerprint. Hash it using
MurmurHash. 16 bp is small and therefore relatively insensitive to
sequencing errors while still offering 4^16 or 4 billion distinct
fingerprints.
*/
#define FINGERPRINT_MAX_OFFSET 64
#define FINGERPRINT_LENGTH 16

static int
DedupEstimator_add_sequence_ptr(DedupEstimator *self,
uint8_t *sequence, size_t sequence_length)
{

uint64_t hash;
if (sequence_length < 16) {
size_t front_sequence_length = self->front_sequence_length;
size_t back_sequence_length = self->back_sequence_length;
size_t front_sequence_offset = self->front_sequence_offset;
size_t back_sequence_offset = self->back_sequence_offset;
size_t fingerprint_length = front_sequence_length + back_sequence_length;
uint8_t *fingerprint = self->fingerprint_store;
if (sequence_length <= fingerprint_length) {
hash = MurmurHash3_x64_64(sequence, sequence_length, 0);
} else {
uint64_t seed = sequence_length >> 6;
uint8_t fingerprint[FINGERPRINT_LENGTH];
size_t remainder = sequence_length - FINGERPRINT_LENGTH;
size_t offset = Py_MIN(remainder / 2, FINGERPRINT_MAX_OFFSET);
memcpy(fingerprint, sequence + offset, FINGERPRINT_LENGTH / 2);
memcpy(fingerprint + (FINGERPRINT_LENGTH / 2),
sequence + sequence_length - (offset + (FINGERPRINT_LENGTH / 2)),
(FINGERPRINT_LENGTH / 2));
hash = MurmurHash3_x64_64(fingerprint, FINGERPRINT_LENGTH, seed);
size_t remainder = sequence_length - fingerprint_length;
size_t front_offset = Py_MIN(remainder / 2, front_sequence_offset);
size_t back_offset = Py_MIN(remainder / 2, back_sequence_offset);
memcpy(fingerprint, sequence + front_offset, front_sequence_length);
memcpy(fingerprint + front_sequence_length,
sequence + sequence_length - (back_offset + back_sequence_length),
back_sequence_length);
hash = MurmurHash3_x64_64(fingerprint, fingerprint_length, seed);
}
size_t modulo_bits = self->modulo_bits;
size_t ignore_mask = (1ULL << modulo_bits) - 1;
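For reference, a pure-Python model of the fingerprint selection in DedupEstimator_add_sequence_ptr above (an illustrative sketch only; the MurmurHash step and the hash-table insertion are omitted):

    def fingerprint(sequence: bytes,
                    front_length: int = 8, back_length: int = 8,
                    front_offset: int = 64, back_offset: int = 64) -> bytes:
        total = front_length + back_length
        if len(sequence) <= total:
            return sequence  # short reads are fingerprinted whole
        remainder = len(sequence) - total
        # Both offsets are clamped to half the remainder, so the front and
        # back samples can never overlap.
        front_start = min(remainder // 2, front_offset)
        back_end = len(sequence) - min(remainder // 2, back_offset)
        return (sequence[front_start:front_start + front_length] +
                sequence[back_end - back_length:back_end])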
@@ -3998,6 +4064,14 @@ static PyMemberDef DedupEstimator_members[] = {
READONLY, NULL},
{"tracked_sequences", T_ULONGLONG, offsetof(DedupEstimator, stored_entries),
READONLY, NULL},
{"front_sequence_length", T_ULONGLONG,
offsetof(DedupEstimator, front_sequence_length), READONLY, NULL},
{"back_sequence_length", T_ULONGLONG,
offsetof(DedupEstimator, back_sequence_length), READONLY, NULL},
{"front_sequence_offset", T_ULONGLONG,
offsetof(DedupEstimator, front_sequence_offset), READONLY, NULL},
{"back_sequence_offset", T_ULONGLONG,
offsetof(DedupEstimator, back_sequence_offset), READONLY, NULL},
{NULL},
};

@@ -4517,5 +4591,9 @@ PyInit__qc(void)
PyModule_AddIntMacro(m, DEFAULT_DEDUP_HASH_TABLE_SIZE_BITS);
PyModule_AddIntMacro(m, DEFAULT_FRAGMENT_LENGTH);
PyModule_AddIntMacro(m, DEFAULT_UNIQUE_SAMPLE_EVERY);
PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH);
PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH);
PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET);
PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET);
return m;
}
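The constants registered in PyInit__qc are importable from Python and should line up with the readonly members added above; a small consistency sketch:

    from sequali._qc import (
        DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH,
        DedupEstimator,
    )

    estimator = DedupEstimator()
    assert (estimator.front_sequence_length
            == DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH)  # 8 by default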