From 4b4bc2e763c6ebebd2c81af16b9ab56ad9632ead Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 19 Nov 2024 16:15:15 +0100 Subject: [PATCH 1/7] Rename SequenceDuplication to OverrpresentedSequences --- src/sequali/__init__.py | 4 +- src/sequali/__main__.py | 16 ++-- src/sequali/_qc.pyi | 2 +- src/sequali/_qcmodule.c | 127 +++++++++++++++-------------- src/sequali/report_modules.py | 19 +++-- tests/test_sequence_duplication.py | 84 +++++++++---------- 6 files changed, 134 insertions(+), 118 deletions(-) diff --git a/src/sequali/__init__.py b/src/sequali/__init__.py index cea6b5d..4cfa811 100644 --- a/src/sequali/__init__.py +++ b/src/sequali/__init__.py @@ -17,7 +17,7 @@ from ._qc import A, C, G, N, T from ._qc import ( AdapterCounter, BamParser, FastqParser, FastqRecordArrayView, - FastqRecordView, PerTileQuality, QCMetrics, SequenceDuplication + FastqRecordView, OverrepresentedSequences, PerTileQuality, QCMetrics, ) from ._qc import NUMBER_OF_NUCS, NUMBER_OF_PHREDS, PHRED_MAX, TABLE_SIZE from ._version import __version__ @@ -32,7 +32,7 @@ "FastqRecordArrayView", "PerTileQuality", "QCMetrics", - "SequenceDuplication", + "OverrepresentedSequences", "NUMBER_OF_NUCS", "NUMBER_OF_PHREDS", "PHRED_MAX", diff --git a/src/sequali/__main__.py b/src/sequali/__main__.py index 0c7568e..ad3ca3b 100644 --- a/src/sequali/__main__.py +++ b/src/sequali/__main__.py @@ -34,9 +34,9 @@ DedupEstimator, InsertSizeMetrics, NanoStats, + OverrepresentedSequences, PerTileQuality, QCMetrics, - SequenceDuplication ) from ._version import __version__ from .adapters import DEFAULT_ADAPTER_FILE, adapters_from_file @@ -188,7 +188,7 @@ def main() -> None: metrics1 = QCMetrics() per_tile_quality1 = PerTileQuality() nanostats1 = NanoStats() - sequence_duplication1 = SequenceDuplication( + overrepresented_sequences1 = OverrepresentedSequences( max_unique_fragments=args.overrepresentation_max_unique_fragments, fragment_length=args.overrepresentation_fragment_length, sample_every=args.overrepresentation_sample_every @@ -219,7 +219,7 @@ def main() -> None: insert_size_metrics = InsertSizeMetrics() metrics2 = QCMetrics() per_tile_quality2 = PerTileQuality() - sequence_duplication2 = SequenceDuplication( + overrepresented_sequences2 = OverrepresentedSequences( max_unique_fragments=args.overrepresentation_max_unique_fragments, fragment_length=args.overrepresentation_fragment_length, sample_every=args.overrepresentation_sample_every @@ -227,7 +227,7 @@ def main() -> None: else: metrics2 = None per_tile_quality2 = None - sequence_duplication2 = None + overrepresented_sequences2 = None insert_size_metrics = None with contextlib.ExitStack() as exit_stack: reader1 = NGSFile(args.input, threads - 1) @@ -253,7 +253,7 @@ def main() -> None: for record_array1 in reader1: metrics1.add_record_array(record_array1) per_tile_quality1.add_record_array(record_array1) - sequence_duplication1.add_record_array(record_array1) + overrepresented_sequences1.add_record_array(record_array1) nanostats1.add_record_array(record_array1) if paired: record_array2 = reader2.read(len(record_array1)) @@ -274,7 +274,7 @@ def main() -> None: insert_size_metrics.add_record_array_pair(record_array1, record_array2) # type: ignore # noqa: E501 metrics2.add_record_array(record_array2) # type: ignore # noqa: E501 per_tile_quality2.add_record_array(record_array2) # type: ignore # noqa: E501 - sequence_duplication2.add_record_array(record_array2) # type: ignore # noqa: E501 + overrepresented_sequences2.add_record_array(record_array2) # type: ignore # noqa: E501 else: adapter_counter1.add_record_array(record_array1) # type: ignore # noqa: E501 dedup_estimator.add_record_array(record_array1) @@ -289,14 +289,14 @@ def main() -> None: metrics=metrics1, adapter_counter=adapter_counter1, per_tile_quality=per_tile_quality1, - sequence_duplication=sequence_duplication1, + sequence_duplication=overrepresented_sequences1, dedup_estimator=dedup_estimator, nanostats=nanostats1, insert_size_metrics=insert_size_metrics, filename_reverse=args.input_reverse, metrics_reverse=metrics2, per_tile_quality_reverse=per_tile_quality2, - sequence_duplication_reverse=sequence_duplication2, + sequence_duplication_reverse=overrepresented_sequences2, adapters=adapters, fraction_threshold=fraction_threshold, min_threshold=min_threshold, diff --git a/src/sequali/_qc.pyi b/src/sequali/_qc.pyi index 06fd9ba..731bd2e 100644 --- a/src/sequali/_qc.pyi +++ b/src/sequali/_qc.pyi @@ -94,7 +94,7 @@ class PerTileQuality: def add_record_array(self, __record_array: FastqRecordArrayView) -> None: ... def get_tile_counts(self) -> List[Tuple[int, List[float], List[int]]]: ... -class SequenceDuplication: +class OverrepresentedSequences: number_of_sequences: int sampled_sequences: int collected_unique_fragments: int diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c index bb8642e..add44e7 100644 --- a/src/sequali/_qcmodule.c +++ b/src/sequali/_qcmodule.c @@ -3135,9 +3135,9 @@ kmer_to_sequence(uint64_t kmer, size_t k, uint8_t *sequence) } } -/************************* - * SEQUENCE DUPLICATION * - *************************/ +/**************************** + * OverrepresentedSequences * + ****************************/ /* A module that cuts the sequence in bits of k size. The canonical (lowest) representation of the bit is used. @@ -3152,7 +3152,7 @@ kmer_to_sequence(uint64_t kmer, size_t k, uint8_t *sequence) #define DEFAULT_FRAGMENT_LENGTH 21 #define DEFAULT_UNIQUE_SAMPLE_EVERY 8 -typedef struct _SequenceDuplicationStruct { +typedef struct _OverrepresentedSequencesStruct { PyObject_HEAD size_t fragment_length; uint64_t number_of_sequences; @@ -3166,10 +3166,10 @@ typedef struct _SequenceDuplicationStruct { uint64_t number_of_unique_fragments; uint64_t total_fragments; size_t sample_every; -} SequenceDuplication; +} OverrepresentedSequences; static void -SequenceDuplication_dealloc(SequenceDuplication *self) +OverrepresentedSequences_dealloc(OverrepresentedSequences *self) { PyMem_Free(self->staging_hash_table); PyMem_Free(self->hashes); @@ -3178,14 +3178,14 @@ SequenceDuplication_dealloc(SequenceDuplication *self) } static PyObject * -SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) +OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) { Py_ssize_t max_unique_fragments = DEFAULT_MAX_UNIQUE_FRAGMENTS; Py_ssize_t fragment_length = DEFAULT_FRAGMENT_LENGTH; Py_ssize_t sample_every = DEFAULT_UNIQUE_SAMPLE_EVERY; static char *kwargnames[] = {"max_unique_fragments", "fragment_length", "sample_every", NULL}; - static char *format = "|nnn:SequenceDuplication"; + static char *format = "|nnn:OverrepresentedSequences"; if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames, &max_unique_fragments, &fragment_length, &sample_every)) { @@ -3221,7 +3221,7 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) PyMem_Free(counts); return PyErr_NoMemory(); } - SequenceDuplication *self = PyObject_New(SequenceDuplication, type); + OverrepresentedSequences *self = PyObject_New(OverrepresentedSequences, type); if (self == NULL) { PyMem_Free(hashes); PyMem_Free(counts); @@ -3243,7 +3243,7 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) } static void -Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash) +Sequence_duplication_insert_hash(OverrepresentedSequences *self, uint64_t hash) { uint64_t hash_to_index_int = self->hash_table_size - 1; uint64_t *hashes = self->hashes; @@ -3271,7 +3271,8 @@ Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash) } static int -SequenceDuplication_resize_staging(SequenceDuplication *self, uint64_t new_size) +OverrepresentedSequences_resize_staging(OverrepresentedSequences *self, + uint64_t new_size) { if (new_size <= self->staging_hash_table_size) { return 0; @@ -3310,7 +3311,8 @@ add_to_staging(uint64_t *staging_hash_table, uint64_t staging_hash_table_size, } static int -SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) +OverrepresentedSequences_add_meta(OverrepresentedSequences *self, + struct FastqMeta *meta) { if (self->number_of_sequences % self->sample_every != 0) { self->number_of_sequences += 1; @@ -3347,7 +3349,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5)); size_t staging_hash_size = 1ULL << staging_hash_bits; if (staging_hash_size > self->staging_hash_table_size) { - if (SequenceDuplication_resize_staging(self, staging_hash_size) < 0) { + if (OverrepresentedSequences_resize_staging(self, staging_hash_size) < 0) { return -1; } } @@ -3404,7 +3406,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) return 0; } -PyDoc_STRVAR(SequenceDuplication_add_read__doc__, +PyDoc_STRVAR(OverrepresentedSequences_add_read__doc__, "add_read($self, read, /)\n" "--\n" "\n" @@ -3413,10 +3415,11 @@ PyDoc_STRVAR(SequenceDuplication_add_read__doc__, " read\n" " A FastqRecordView object.\n"); -#define SequenceDuplication_add_read_method METH_O +#define OverrepresentedSequences_add_read_method METH_O static PyObject * -SequenceDuplication_add_read(SequenceDuplication *self, FastqRecordView *read) +OverrepresentedSequences_add_read(OverrepresentedSequences *self, + FastqRecordView *read) { if (!FastqRecordView_CheckExact(read)) { PyErr_Format(PyExc_TypeError, @@ -3424,13 +3427,13 @@ SequenceDuplication_add_read(SequenceDuplication *self, FastqRecordView *read) Py_TYPE(read)->tp_name); return NULL; } - if (SequenceDuplication_add_meta(self, &read->meta) != 0) { + if (OverrepresentedSequences_add_meta(self, &read->meta) != 0) { return NULL; } Py_RETURN_NONE; } -PyDoc_STRVAR(SequenceDuplication_add_record_array__doc__, +PyDoc_STRVAR(OverrepresentedSequences_add_record_array__doc__, "add_record_array($self, record_array, /)\n" "--\n" "\n" @@ -3439,11 +3442,11 @@ PyDoc_STRVAR(SequenceDuplication_add_record_array__doc__, " record_array\n" " A FastqRecordArrayView object.\n"); -#define SequenceDuplication_add_record_array_method METH_O +#define OverrepresentedSequences_add_record_array_method METH_O static PyObject * -SequenceDuplication_add_record_array(SequenceDuplication *self, - FastqRecordArrayView *record_array) +OverrepresentedSequences_add_record_array(OverrepresentedSequences *self, + FastqRecordArrayView *record_array) { if (!FastqRecordArrayView_CheckExact(record_array)) { PyErr_Format( @@ -3455,24 +3458,24 @@ SequenceDuplication_add_record_array(SequenceDuplication *self, Py_ssize_t number_of_records = Py_SIZE(record_array); struct FastqMeta *records = record_array->records; for (Py_ssize_t i = 0; i < number_of_records; i++) { - if (SequenceDuplication_add_meta(self, records + i) != 0) { + if (OverrepresentedSequences_add_meta(self, records + i) != 0) { return NULL; } } Py_RETURN_NONE; } -PyDoc_STRVAR(SequenceDuplication_sequence_counts__doc__, +PyDoc_STRVAR(OverrepresentedSequences_sequence_counts__doc__, "sequence_counts($self, /)\n" "--\n" "\n" "Get a dictionary with sequence counts \n"); -#define SequenceDuplication_sequence_counts_method METH_NOARGS +#define OverrepresentedSequences_sequence_counts_method METH_NOARGS static PyObject * -SequenceDuplication_sequence_counts(SequenceDuplication *self, - PyObject *Py_UNUSED(ignore)) +OverrepresentedSequences_sequence_counts(OverrepresentedSequences *self, + PyObject *Py_UNUSED(ignore)) { PyObject *count_dict = PyDict_New(); if (count_dict == NULL) { @@ -3511,7 +3514,7 @@ SequenceDuplication_sequence_counts(SequenceDuplication *self, } PyDoc_STRVAR( - SequenceDuplication_overrepresented_sequences__doc__, + OverrepresentedSequences_overrepresented_sequences__doc__, "overrepresented_sequences($self, threshold=0.001)\n" "--\n" "\n" @@ -3534,19 +3537,21 @@ PyDoc_STRVAR( "high " " numbers of sequences."); -#define SequenceDuplication_overrepresented_sequences_method \ +#define OverrepresentedSequences_overrepresented_sequences_method \ METH_VARARGS | METH_KEYWORDS static PyObject * -SequenceDuplication_overrepresented_sequences(SequenceDuplication *self, - PyObject *args, PyObject *kwargs) +OverrepresentedSequences_overrepresented_sequences(OverrepresentedSequences *self, + PyObject *args, + PyObject *kwargs) { double threshold = 0.0001; // 0.01 % Py_ssize_t min_threshold = 1; Py_ssize_t max_threshold = PY_SSIZE_T_MAX; static char *kwargnames[] = {"threshold_fraction", "min_threshold", "max_threshold", NULL}; - static char *format = "|dnn:SequenceDuplication.overrepresented_sequences"; + static char *format = + "|dnn:OverrepresentedSequences.overrepresented_sequences"; if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames, &threshold, &min_threshold, &max_threshold)) { return NULL; @@ -3627,51 +3632,53 @@ SequenceDuplication_overrepresented_sequences(SequenceDuplication *self, return NULL; } -static PyMethodDef SequenceDuplication_methods[] = { - {"add_read", (PyCFunction)SequenceDuplication_add_read, - SequenceDuplication_add_read_method, SequenceDuplication_add_read__doc__}, - {"add_record_array", (PyCFunction)SequenceDuplication_add_record_array, - SequenceDuplication_add_record_array_method, - SequenceDuplication_add_record_array__doc__}, - {"sequence_counts", (PyCFunction)SequenceDuplication_sequence_counts, - SequenceDuplication_sequence_counts_method, - SequenceDuplication_sequence_counts__doc__}, +static PyMethodDef OverrepresentedSequences_methods[] = { + {"add_read", (PyCFunction)OverrepresentedSequences_add_read, + OverrepresentedSequences_add_read_method, + OverrepresentedSequences_add_read__doc__}, + {"add_record_array", (PyCFunction)OverrepresentedSequences_add_record_array, + OverrepresentedSequences_add_record_array_method, + OverrepresentedSequences_add_record_array__doc__}, + {"sequence_counts", (PyCFunction)OverrepresentedSequences_sequence_counts, + OverrepresentedSequences_sequence_counts_method, + OverrepresentedSequences_sequence_counts__doc__}, {"overrepresented_sequences", - (PyCFunction)(void (*)(void))SequenceDuplication_overrepresented_sequences, - SequenceDuplication_overrepresented_sequences_method, - SequenceDuplication_overrepresented_sequences__doc__}, + (PyCFunction)(void (*)(void))OverrepresentedSequences_overrepresented_sequences, + OverrepresentedSequences_overrepresented_sequences_method, + OverrepresentedSequences_overrepresented_sequences__doc__}, {NULL}, }; -static PyMemberDef SequenceDuplication_members[] = { +static PyMemberDef OverrepresentedSequences_members[] = { {"number_of_sequences", T_ULONGLONG, - offsetof(SequenceDuplication, number_of_sequences), READONLY, + offsetof(OverrepresentedSequences, number_of_sequences), READONLY, "The total number of sequences submitted."}, {"sampled_sequences", T_ULONGLONG, - offsetof(SequenceDuplication, sampled_sequences), READONLY, + offsetof(OverrepresentedSequences, sampled_sequences), READONLY, "The total number of sequences that were analysed."}, {"collected_unique_fragments", T_ULONGLONG, - offsetof(SequenceDuplication, number_of_unique_fragments), READONLY, + offsetof(OverrepresentedSequences, number_of_unique_fragments), READONLY, "The number of unique fragments collected."}, {"max_unique_fragments", T_ULONGLONG, - offsetof(SequenceDuplication, max_unique_fragments), READONLY, + offsetof(OverrepresentedSequences, max_unique_fragments), READONLY, "The maximum number of unique sequences stored in the object."}, - {"fragment_length", T_BYTE, offsetof(SequenceDuplication, fragment_length), + {"fragment_length", T_BYTE, offsetof(OverrepresentedSequences, fragment_length), READONLY, "The length of the sampled sequences"}, - {"sample_every", T_PYSSIZET, offsetof(SequenceDuplication, sample_every), + {"sample_every", T_PYSSIZET, offsetof(OverrepresentedSequences, sample_every), READONLY, "One in this many reads is sampled"}, - {"total_fragments", T_ULONGLONG, offsetof(SequenceDuplication, total_fragments), - READONLY, "Total number of fragments."}, + {"total_fragments", T_ULONGLONG, + offsetof(OverrepresentedSequences, total_fragments), READONLY, + "Total number of fragments."}, {NULL}, }; -static PyTypeObject SequenceDuplication_Type = { - .tp_name = "_qc.SequenceDuplication", - .tp_basicsize = sizeof(SequenceDuplication), - .tp_dealloc = (destructor)(SequenceDuplication_dealloc), - .tp_new = (newfunc)SequenceDuplication__new__, - .tp_members = SequenceDuplication_members, - .tp_methods = SequenceDuplication_methods, +static PyTypeObject OverrepresentedSequences_Type = { + .tp_name = "_qc.OverrepresentedSequences", + .tp_basicsize = sizeof(OverrepresentedSequences), + .tp_dealloc = (destructor)(OverrepresentedSequences_dealloc), + .tp_new = (newfunc)OverrepresentedSequences__new__, + .tp_members = OverrepresentedSequences_members, + .tp_methods = OverrepresentedSequences_methods, }; /******************* @@ -5193,7 +5200,7 @@ PyInit__qc(void) if (python_module_add_type(m, &PerTileQuality_Type) != 0) { return NULL; } - if (python_module_add_type(m, &SequenceDuplication_Type) != 0) { + if (python_module_add_type(m, &OverrepresentedSequences_Type) != 0) { return NULL; } if (python_module_add_type(m, &DedupEstimator_Type) != 0) { diff --git a/src/sequali/report_modules.py b/src/sequali/report_modules.py index d539c73..0c9af2e 100644 --- a/src/sequali/report_modules.py +++ b/src/sequali/report_modules.py @@ -35,9 +35,16 @@ import pygal.style # type: ignore from ._qc import A, C, G, N, T -from ._qc import (AdapterCounter, DedupEstimator, - INSERT_SIZE_MAX_ADAPTER_STORE_SIZE, InsertSizeMetrics, - NanoStats, PerTileQuality, QCMetrics, SequenceDuplication) +from ._qc import ( + AdapterCounter, + DedupEstimator, + INSERT_SIZE_MAX_ADAPTER_STORE_SIZE, + InsertSizeMetrics, + NanoStats, + OverrepresentedSequences, + PerTileQuality, + QCMetrics, +) from ._qc import NUMBER_OF_NUCS, NUMBER_OF_PHREDS, PHRED_MAX from ._version import __version__ from .adapters import Adapter @@ -1495,7 +1502,7 @@ def to_html(self) -> str: @classmethod def from_sequence_duplication( cls, - seqdup: SequenceDuplication, + seqdup: OverrepresentedSequences, fraction_threshold: float = DEFAULT_FRACTION_THRESHOLD, min_threshold: int = DEFAULT_MIN_THRESHOLD, max_threshold: int = DEFAULT_MAX_THRESHOLD, @@ -2131,7 +2138,7 @@ def calculate_stats( filename: str, metrics: QCMetrics, per_tile_quality: PerTileQuality, - sequence_duplication: SequenceDuplication, + sequence_duplication: OverrepresentedSequences, dedup_estimator: DedupEstimator, nanostats: NanoStats, adapters: List[Adapter], @@ -2140,7 +2147,7 @@ def calculate_stats( insert_size_metrics: Optional[InsertSizeMetrics] = None, metrics_reverse: Optional[QCMetrics] = None, per_tile_quality_reverse: Optional[PerTileQuality] = None, - sequence_duplication_reverse: Optional[SequenceDuplication] = None, + sequence_duplication_reverse: Optional[OverrepresentedSequences] = None, graph_resolution: int = 200, fraction_threshold: float = DEFAULT_FRACTION_THRESHOLD, min_threshold: int = DEFAULT_MIN_THRESHOLD, diff --git a/tests/test_sequence_duplication.py b/tests/test_sequence_duplication.py index b7cee7f..0801ac7 100644 --- a/tests/test_sequence_duplication.py +++ b/tests/test_sequence_duplication.py @@ -19,7 +19,7 @@ import pytest -from sequali import FastqRecordView, SequenceDuplication +from sequali import FastqRecordView, OverrepresentedSequences def view_from_sequence(sequence: str) -> FastqRecordView: @@ -33,9 +33,11 @@ def view_from_sequence(sequence: str) -> FastqRecordView: def test_sequence_duplication(): max_unique_fragments = 100_000 fragment_length = 31 - seqdup = SequenceDuplication(max_unique_fragments=max_unique_fragments, - fragment_length=fragment_length, - sample_every=1) + seqdup = OverrepresentedSequences( + max_unique_fragments=max_unique_fragments, + fragment_length=fragment_length, + sample_every=1 + ) # Create unique sequences by using all combinations of ACGT for the amount # of letters that is necessary to completely saturate the maximum unique # sequences @@ -59,18 +61,18 @@ def test_sequence_duplication(): def test_sequence_duplication_add_read_no_view(): - seqdup = SequenceDuplication() + seqs = OverrepresentedSequences() with pytest.raises(TypeError) as error: - seqdup.add_read(b"ACGT") # type: ignore + seqs.add_read(b"ACGT") # type: ignore error.match("FastqRecordView") error.match("bytes") @pytest.mark.parametrize("threshold", [-0.1, 1.1]) def test_sequence_duplication_overrepresented_sequences_faulty_threshold(threshold): - seqdup = SequenceDuplication() + seqs = OverrepresentedSequences() with pytest.raises(ValueError) as error: - seqdup.overrepresented_sequences(threshold_fraction=threshold) + seqs.overrepresented_sequences(threshold_fraction=threshold) error.match(str(threshold)) error.match("between") error.match("0.0") @@ -79,20 +81,20 @@ def test_sequence_duplication_overrepresented_sequences_faulty_threshold(thresho def test_sequence_duplication_overrepresented_sequences(): fragment_length = 31 - seqdup = SequenceDuplication(sample_every=1, fragment_length=fragment_length) + seqs = OverrepresentedSequences(sample_every=1, fragment_length=fragment_length) for i in range(100): - seqdup.add_read(view_from_sequence("A" * fragment_length)) + seqs.add_read(view_from_sequence("A" * fragment_length)) for i in range(200): - seqdup.add_read(view_from_sequence("C" * fragment_length)) + seqs.add_read(view_from_sequence("C" * fragment_length)) for i in range(2000): - seqdup.add_read(view_from_sequence("G" * fragment_length)) + seqs.add_read(view_from_sequence("G" * fragment_length)) for i in range(10): - seqdup.add_read(view_from_sequence("T" * fragment_length)) - seqdup.add_read(view_from_sequence("C" * (fragment_length - 1) + "A")) + seqs.add_read(view_from_sequence("T" * fragment_length)) + seqs.add_read(view_from_sequence("C" * (fragment_length - 1) + "A")) for i in range(100_000 - (100 + 200 + 2000 + 10 + 1)): # Count up to 100_000 to get nice fractions for all the sequences - seqdup.add_read(view_from_sequence("A" * (fragment_length - 1) + "C")) - overrepresented = seqdup.overrepresented_sequences(threshold_fraction=0.001) + seqs.add_read(view_from_sequence("A" * (fragment_length - 1) + "C")) + overrepresented = seqs.overrepresented_sequences(threshold_fraction=0.001) assert overrepresented[0][2] == "A" * (fragment_length - 1) + "C" assert overrepresented[1][2] == "C" * fragment_length assert overrepresented[1][0] == 2200 @@ -100,14 +102,14 @@ def test_sequence_duplication_overrepresented_sequences(): assert overrepresented[2][1] == 110 / 100_000 # Assert no other sequences recorded as overrepresented. assert len(overrepresented) == 3 - overrepresented = seqdup.overrepresented_sequences(threshold_fraction=0.00001) + overrepresented = seqs.overrepresented_sequences(threshold_fraction=0.00001) assert overrepresented[-1][2] == "C" * (fragment_length - 1) + "A" - overrepresented = seqdup.overrepresented_sequences( + overrepresented = seqs.overrepresented_sequences( threshold_fraction=0.00001, min_threshold=2, ) assert overrepresented[-1][2] == "A" * fragment_length - overrepresented = seqdup.overrepresented_sequences( + overrepresented = seqs.overrepresented_sequences( threshold_fraction=0.1, min_threshold=2, max_threshold=1000, @@ -118,13 +120,13 @@ def test_sequence_duplication_overrepresented_sequences(): def test_sequence_duplication_case_insensitive(): fragment_length = 31 - seqdup = SequenceDuplication(fragment_length=fragment_length, sample_every=1) - seqdup.add_read(view_from_sequence("aaTTaca" * 5)) - seqdup.add_read(view_from_sequence("AAttACA" * 5)) - seqcounts = seqdup.sequence_counts() - assert seqdup.number_of_sequences == 2 - assert seqdup.total_fragments == 4 - assert seqdup.collected_unique_fragments == 2 + seqs = OverrepresentedSequences(fragment_length=fragment_length, sample_every=1) + seqs.add_read(view_from_sequence("aaTTaca" * 5)) + seqs.add_read(view_from_sequence("AAttACA" * 5)) + seqcounts = seqs.sequence_counts() + assert seqs.number_of_sequences == 2 + assert seqs.total_fragments == 4 + assert seqs.collected_unique_fragments == 2 assert len(seqcounts) == 2 assert seqcounts[("AATTACA" * 5)[:fragment_length]] == 2 assert seqcounts[("AATTACA" * 5)[-fragment_length:]] == 2 @@ -132,13 +134,13 @@ def test_sequence_duplication_case_insensitive(): @pytest.mark.parametrize("divisor", list(range(1, 21))) def test_sequence_duplication_sampling_rate(divisor): - seqdup = SequenceDuplication(sample_every=divisor) + seqs = OverrepresentedSequences(sample_every=divisor) read = view_from_sequence("AAAA") number_of_sequences = 10_000 for i in range(number_of_sequences): - seqdup.add_read(read) - assert seqdup.number_of_sequences == number_of_sequences - assert seqdup.sampled_sequences == (number_of_sequences + divisor - 1) // divisor + seqs.add_read(read) + assert seqs.number_of_sequences == number_of_sequences + assert seqs.sampled_sequences == (number_of_sequences + divisor - 1) // divisor @pytest.mark.parametrize(["sequence", "result"], [ @@ -150,29 +152,29 @@ def test_sequence_duplication_sampling_rate(divisor): ("GATTACGATTAC", {"ATC": 1, "GTA": 1}), ]) def test_sequence_duplication_all_fragments(sequence, result): - seqdup = SequenceDuplication(fragment_length=3, sample_every=1) - seqdup.add_read(view_from_sequence(sequence)) - seq_counts = seqdup.sequence_counts() + seqs = OverrepresentedSequences(fragment_length=3, sample_every=1) + seqs.add_read(view_from_sequence(sequence)) + seq_counts = seqs.sequence_counts() assert seq_counts == result def test_very_short_sequence(): # With 32 byte load this will overflow the used memory. - seqdup = SequenceDuplication(fragment_length=3, sample_every=1) - seqdup.add_read(view_from_sequence("ACT")) - assert seqdup.sequence_counts() == {"ACT": 1} + seqs = OverrepresentedSequences(fragment_length=3, sample_every=1) + seqs.add_read(view_from_sequence("ACT")) + assert seqs.sequence_counts() == {"ACT": 1} def test_non_iupac_warning(): - seqdup = SequenceDuplication(fragment_length=3, sample_every=1) + seqs = OverrepresentedSequences(fragment_length=3, sample_every=1) with pytest.warns(UserWarning, match="KKK"): - seqdup.add_read(view_from_sequence("KKK")) + seqs.add_read(view_from_sequence("KKK")) def test_valid_does_not_warn_for_n(): - seqdup = SequenceDuplication(fragment_length=3, sample_every=1) + seqs = OverrepresentedSequences(fragment_length=3, sample_every=1) with warnings.catch_warnings(): warnings.simplefilter("error") - seqdup.add_read(view_from_sequence("ACGTN")) + seqs.add_read(view_from_sequence("ACGTN")) # N does lead to a sample not being loaded. - assert seqdup.sampled_sequences == 1 + assert seqs.sampled_sequences == 1 From ae3a8a0b7f9180bf8d3a4c7181afb27860a10f4a Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 19 Nov 2024 16:34:15 +0100 Subject: [PATCH 2/7] Add facilities to set values for from beginning and end sampling --- src/sequali/__main__.py | 16 ++++++++++++++++ src/sequali/_qc.pyi | 7 ++++++- src/sequali/_qcmodule.c | 21 +++++++++++++++++++-- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/sequali/__main__.py b/src/sequali/__main__.py index ad3ca3b..68cab23 100644 --- a/src/sequali/__main__.py +++ b/src/sequali/__main__.py @@ -23,6 +23,8 @@ from ._qc import ( AdapterCounter, + DEFAULT_BASES_FROM_END, + DEFAULT_BASES_FROM_START, DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS, DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH, DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET, @@ -119,6 +121,20 @@ def argument_parser() -> argparse.ArgumentParser: f"gets filled up with more sequences from the " f"beginning. " f"Default: 1 in {DEFAULT_UNIQUE_SAMPLE_EVERY}.") + parser.add_argument("--overrepresentation-bases-from-start", type=int, + default=DEFAULT_BASES_FROM_START, + metavar="BP", + help=f"The minimum amount of bases sampled from the " + f"start of the read. There might be slight overshoot " + f"depending on the fragment length. " + f"Default: {DEFAULT_BASES_FROM_START}") + parser.add_argument("--overrepresentation-bases-from-end", type=int, + default=DEFAULT_BASES_FROM_END, + metavar="BP", + help=f"The minimum amount of bases sampled from the " + f"end of the read. There might be slight overshoot " + f"depending on the fragment length. " + f"Default: {DEFAULT_BASES_FROM_END}") parser.add_argument("--duplication-max-stored-fingerprints", type=int, default=DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS, metavar="N", diff --git a/src/sequali/_qc.pyi b/src/sequali/_qc.pyi index 731bd2e..77fdaf8 100644 --- a/src/sequali/_qc.pyi +++ b/src/sequali/_qc.pyi @@ -32,6 +32,8 @@ DEFAULT_MAX_UNIQUE_FRAGMENTS: int DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS: int DEFAULT_FRAGMENT_LENGTH: int DEFAULT_UNIQUE_SAMPLE_EVERY: int +DEFAULT_BASES_FROM_START: int +DEFAULT_BASES_FROM_END: int DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH: int DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH: int DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET: int @@ -106,7 +108,10 @@ class OverrepresentedSequences: def __init__(self, max_unique_fragments: int = DEFAULT_MAX_UNIQUE_FRAGMENTS, fragment_length: int = DEFAULT_FRAGMENT_LENGTH, - sample_every: int = DEFAULT_UNIQUE_SAMPLE_EVERY): ... + sample_every: int = DEFAULT_UNIQUE_SAMPLE_EVERY, + bases_from_start: int = DEFAULT_BASES_FROM_START, + bases_from_end = DEFAULT_BASES_FROM_END, + ): ... def add_read(self, __read: FastqRecordView) -> None: ... def add_record_array(self, __record_array: FastqRecordArrayView) -> None: ... def sequence_counts(self) -> Dict[str, int]: ... diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c index add44e7..be070af 100644 --- a/src/sequali/_qcmodule.c +++ b/src/sequali/_qcmodule.c @@ -3151,6 +3151,8 @@ kmer_to_sequence(uint64_t kmer, size_t k, uint8_t *sequence) #define DEFAULT_MAX_UNIQUE_FRAGMENTS 5000000 #define DEFAULT_FRAGMENT_LENGTH 21 #define DEFAULT_UNIQUE_SAMPLE_EVERY 8 +#define DEFAULT_BASES_FROM_START 100 +#define DEFAULT_BASES_FROM_END 100 typedef struct _OverrepresentedSequencesStruct { PyObject_HEAD @@ -3166,6 +3168,8 @@ typedef struct _OverrepresentedSequencesStruct { uint64_t number_of_unique_fragments; uint64_t total_fragments; size_t sample_every; + size_t bases_from_start; + size_t bases_from_end; } OverrepresentedSequences; static void @@ -3183,9 +3187,12 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw Py_ssize_t max_unique_fragments = DEFAULT_MAX_UNIQUE_FRAGMENTS; Py_ssize_t fragment_length = DEFAULT_FRAGMENT_LENGTH; Py_ssize_t sample_every = DEFAULT_UNIQUE_SAMPLE_EVERY; + Py_ssize_t bases_from_start = DEFAULT_BASES_FROM_START; + Py_ssize_t bases_from_end = DEFAULT_BASES_FROM_END; static char *kwargnames[] = {"max_unique_fragments", "fragment_length", - "sample_every", NULL}; - static char *format = "|nnn:OverrepresentedSequences"; + "sample_every", "bases_from_start", + "bases_from_end", NULL}; + static char *format = "|nnnnn:OverrepresentedSequences"; if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames, &max_unique_fragments, &fragment_length, &sample_every)) { @@ -3209,6 +3216,12 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw "sample_every must be 1 or greater. Got %zd", sample_every); return NULL; } + if (bases_from_start < 1) { + bases_from_start = UINT32_MAX; + } + if (bases_from_end < 1) { + bases_from_end = UINT32_MAX; + } /* If size is a power of 2, the modulo HASH_TABLE_SIZE can be optimised to a bitwise AND. Using 1.5 times as a base we ensure that the hashtable is utilized for at most 2/3. (Increased business degrades performance.) */ @@ -3239,6 +3252,8 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw self->hashes = hashes; self->counts = counts; self->sample_every = sample_every; + self->bases_from_start = bases_from_start; + self->bases_from_end = bases_from_end; return (PyObject *)self; } @@ -5233,6 +5248,8 @@ PyInit__qc(void) PyModule_AddIntMacro(m, DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS); PyModule_AddIntMacro(m, DEFAULT_FRAGMENT_LENGTH); PyModule_AddIntMacro(m, DEFAULT_UNIQUE_SAMPLE_EVERY); + PyModule_AddIntMacro(m, DEFAULT_BASES_FROM_START); + PyModule_AddIntMacro(m, DEFAULT_BASES_FROM_END); PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH); PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH); PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET); From ea206581a3aa2faae4a31059276886cfac4d552f Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 19 Nov 2024 17:33:07 +0100 Subject: [PATCH 3/7] Implement sampling from extremeties --- src/sequali/_qcmodule.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c index be070af..6acc731 100644 --- a/src/sequali/_qcmodule.c +++ b/src/sequali/_qcmodule.c @@ -3168,8 +3168,8 @@ typedef struct _OverrepresentedSequencesStruct { uint64_t number_of_unique_fragments; uint64_t total_fragments; size_t sample_every; - size_t bases_from_start; - size_t bases_from_end; + Py_ssize_t fragments_from_start; + Py_ssize_t fragments_from_end; } OverrepresentedSequences; static void @@ -3252,8 +3252,10 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw self->hashes = hashes; self->counts = counts; self->sample_every = sample_every; - self->bases_from_start = bases_from_start; - self->bases_from_end = bases_from_end; + self->fragments_from_start = + (bases_from_start + fragment_length - 1) / fragment_length; + self->fragments_from_end = + (bases_from_end + fragment_length - 1) / fragment_length; return (PyObject *)self; } @@ -3358,9 +3360,28 @@ OverrepresentedSequences_add_meta(OverrepresentedSequences *self, If the sequence length is exactly divisible by the fragment length, this results in exactly no overlap between front and back fragments, while still all of the sequence is being sampled. + + If the sequence is very large the amount of samples is taken is limited + by a user-settable maximum. + */ + /* Vader: Luke, Obi-Wan never told you about the algorithm... + Luke: He told me enough! It uses integer division! + Vader: Yes Luke, we have to use integer division. + Luke: No, that's not true! The compiler can optimize integer division + away for constants! + Vader: Search your feelings Luke! The fragment length has to be user + settable! + Luke: NOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO */ - Py_ssize_t total_fragments = + Py_ssize_t max_fragments = (sequence_length + fragment_length - 1) / fragment_length; + Py_ssize_t from_mid_point_fragments = max_fragments / 2; + Py_ssize_t max_start_fragments = max_fragments - from_mid_point_fragments; + Py_ssize_t fragments_from_start = + Py_MIN(self->fragments_from_start, max_start_fragments); + Py_ssize_t fragments_from_end = + Py_MIN(self->fragments_from_end, from_mid_point_fragments); + Py_ssize_t total_fragments = fragments_from_start + fragments_from_end; size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5)); size_t staging_hash_size = 1ULL << staging_hash_bits; if (staging_hash_size > self->staging_hash_table_size) { @@ -3371,12 +3392,12 @@ OverrepresentedSequences_add_meta(OverrepresentedSequences *self, uint64_t *staging_hash_table = self->staging_hash_table; memset(staging_hash_table, 0, staging_hash_size * sizeof(uint64_t)); - Py_ssize_t from_mid_point_fragments = total_fragments / 2; - Py_ssize_t mid_point = - sequence_length - (from_mid_point_fragments * fragment_length); + Py_ssize_t start_end = fragments_from_start * fragment_length; + Py_ssize_t end_start = + sequence_length - (fragments_from_end * fragment_length); bool warn_unknown = false; // Sample front sequences - for (Py_ssize_t i = 0; i < mid_point; i += fragment_length) { + for (Py_ssize_t i = 0; i < start_end; i += fragment_length) { int64_t kmer = sequence_to_canonical_kmer(sequence + i, fragment_length); if (kmer < 0) { if (kmer == TWOBIT_UNKNOWN_CHAR) { @@ -3390,7 +3411,7 @@ OverrepresentedSequences_add_meta(OverrepresentedSequences *self, } // Sample back sequences - for (Py_ssize_t i = mid_point; i < sequence_length; i += fragment_length) { + for (Py_ssize_t i = end_start; i < sequence_length; i += fragment_length) { int64_t kmer = sequence_to_canonical_kmer(sequence + i, fragment_length); if (kmer < 0) { if (kmer == TWOBIT_UNKNOWN_CHAR) { From 163103e3819f0472648829cab203298ed7307470 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 20 Nov 2024 10:45:04 +0100 Subject: [PATCH 4/7] Rename tests --- ...ation.py => test_overrepresented_sequences.py} | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) rename tests/{test_sequence_duplication.py => test_overrepresented_sequences.py} (94%) diff --git a/tests/test_sequence_duplication.py b/tests/test_overrepresented_sequences.py similarity index 94% rename from tests/test_sequence_duplication.py rename to tests/test_overrepresented_sequences.py index 0801ac7..7be86dc 100644 --- a/tests/test_sequence_duplication.py +++ b/tests/test_overrepresented_sequences.py @@ -30,7 +30,7 @@ def view_from_sequence(sequence: str) -> FastqRecordView: ) -def test_sequence_duplication(): +def test_overrepresented_sequences(): max_unique_fragments = 100_000 fragment_length = 31 seqdup = OverrepresentedSequences( @@ -60,7 +60,7 @@ def test_sequence_duplication(): assert sequence_counts[fragment_length * "A"] == 2 -def test_sequence_duplication_add_read_no_view(): +def test_overrepresented_sequences_add_read_no_view(): seqs = OverrepresentedSequences() with pytest.raises(TypeError) as error: seqs.add_read(b"ACGT") # type: ignore @@ -69,7 +69,8 @@ def test_sequence_duplication_add_read_no_view(): @pytest.mark.parametrize("threshold", [-0.1, 1.1]) -def test_sequence_duplication_overrepresented_sequences_faulty_threshold(threshold): +def test_overrepresented_sequences_overrepresented_sequences_faulty_threshold( + threshold): seqs = OverrepresentedSequences() with pytest.raises(ValueError) as error: seqs.overrepresented_sequences(threshold_fraction=threshold) @@ -79,7 +80,7 @@ def test_sequence_duplication_overrepresented_sequences_faulty_threshold(thresho error.match("1.0") -def test_sequence_duplication_overrepresented_sequences(): +def test_overrepresented_sequences_overrepresented_sequences(): fragment_length = 31 seqs = OverrepresentedSequences(sample_every=1, fragment_length=fragment_length) for i in range(100): @@ -118,7 +119,7 @@ def test_sequence_duplication_overrepresented_sequences(): assert overrepresented[1][2] == "C" * fragment_length -def test_sequence_duplication_case_insensitive(): +def test_overrepresented_sequences_case_insensitive(): fragment_length = 31 seqs = OverrepresentedSequences(fragment_length=fragment_length, sample_every=1) seqs.add_read(view_from_sequence("aaTTaca" * 5)) @@ -133,7 +134,7 @@ def test_sequence_duplication_case_insensitive(): @pytest.mark.parametrize("divisor", list(range(1, 21))) -def test_sequence_duplication_sampling_rate(divisor): +def test_overrepresented_sequences_sampling_rate(divisor): seqs = OverrepresentedSequences(sample_every=divisor) read = view_from_sequence("AAAA") number_of_sequences = 10_000 @@ -151,7 +152,7 @@ def test_sequence_duplication_sampling_rate(divisor): # Fragments that are duplicated in the sequence should only be recorded once. ("GATTACGATTAC", {"ATC": 1, "GTA": 1}), ]) -def test_sequence_duplication_all_fragments(sequence, result): +def test_overrepresented_sequences_all_fragments(sequence, result): seqs = OverrepresentedSequences(fragment_length=3, sample_every=1) seqs.add_read(view_from_sequence(sequence)) seq_counts = seqs.sequence_counts() From 7a78762db722f2a9122ff43b88ec4d5e1c477b1b Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 20 Nov 2024 11:03:04 +0100 Subject: [PATCH 5/7] Add a behavior test for overrepresented sequences sampling --- src/sequali/__main__.py | 6 ++++-- src/sequali/_qcmodule.c | 10 ++++----- tests/test_overrepresented_sequences.py | 27 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/sequali/__main__.py b/src/sequali/__main__.py index 68cab23..2dc171f 100644 --- a/src/sequali/__main__.py +++ b/src/sequali/__main__.py @@ -126,14 +126,16 @@ def argument_parser() -> argparse.ArgumentParser: metavar="BP", help=f"The minimum amount of bases sampled from the " f"start of the read. There might be slight overshoot " - f"depending on the fragment length. " + f"depending on the fragment length. Set to a " + f"negative value to sample the entire read. " f"Default: {DEFAULT_BASES_FROM_START}") parser.add_argument("--overrepresentation-bases-from-end", type=int, default=DEFAULT_BASES_FROM_END, metavar="BP", help=f"The minimum amount of bases sampled from the " f"end of the read. There might be slight overshoot " - f"depending on the fragment length. " + f"depending on the fragment length. Set to a " + f"negative value to sample the entire read. " f"Default: {DEFAULT_BASES_FROM_END}") parser.add_argument("--duplication-max-stored-fingerprints", type=int, default=DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS, diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c index 6acc731..57e3033 100644 --- a/src/sequali/_qcmodule.c +++ b/src/sequali/_qcmodule.c @@ -3193,9 +3193,9 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw "sample_every", "bases_from_start", "bases_from_end", NULL}; static char *format = "|nnnnn:OverrepresentedSequences"; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames, - &max_unique_fragments, &fragment_length, - &sample_every)) { + if (!PyArg_ParseTupleAndKeywords( + args, kwargs, format, kwargnames, &max_unique_fragments, + &fragment_length, &sample_every, &bases_from_start, &bases_from_end)) { return NULL; } if (max_unique_fragments < 1) { @@ -3216,10 +3216,10 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw "sample_every must be 1 or greater. Got %zd", sample_every); return NULL; } - if (bases_from_start < 1) { + if (bases_from_start < 0) { bases_from_start = UINT32_MAX; } - if (bases_from_end < 1) { + if (bases_from_end < 0) { bases_from_end = UINT32_MAX; } /* If size is a power of 2, the modulo HASH_TABLE_SIZE can be optimised to diff --git a/tests/test_overrepresented_sequences.py b/tests/test_overrepresented_sequences.py index 7be86dc..e7391b7 100644 --- a/tests/test_overrepresented_sequences.py +++ b/tests/test_overrepresented_sequences.py @@ -179,3 +179,30 @@ def test_valid_does_not_warn_for_n(): seqs.add_read(view_from_sequence("ACGTN")) # N does lead to a sample not being loaded. assert seqs.sampled_sequences == 1 + + +@pytest.mark.parametrize( + ["bases_from_start", "bases_from_end", "result"], [ + (0, 0, ()), + (1, 1, ("AAC", "CAA")), + (2, 2, ("AAC", "CAA")), + (3, 3, ("AAC", "CAA")), + (4, 4, ("AAC", "CAA", "CCG", "GCC")), + (1, 0, ("AAC",)), + (0, 1, ("CAA",)), + (100, 100, ("AAA", "AAC", "CAA", "CCG", "GCC")), + (-1, -1, ("AAA", "AAC", "CAA", "CCG", "GCC")), + ] +) +def test_overrepresented_sequences_sample_from_begin_and_end( + bases_from_start, bases_from_end, result): + seqs = OverrepresentedSequences( + fragment_length=3, + sample_every=1, + bases_from_start=bases_from_start, + bases_from_end=bases_from_end + ) + seqs.add_read(view_from_sequence("AACCGGTTTTGGCCAA")) + overrepresented = [x[2] for x in seqs.overrepresented_sequences(min_threshold=1)] + overrepresented.sort() + assert tuple(overrepresented) == result From f347ae23317fea9306f043f950272cc9bf75b71d Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 20 Nov 2024 11:07:03 +0100 Subject: [PATCH 6/7] Add overrepresented sequencing sampling to the changelog --- CHANGELOG.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 119675b..bb5614a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,14 @@ Changelog .. This document is user facing. Please word the changes in such a way .. that users understand how the changes affect the new version. +version 0.13.0-dev +------------------ ++ Only sample the first 100 bp from the beginning and end of the read by + default for the overrepresented sequences analysis. This prevents a lot of + false positives from common human genome repeats. The amount of base pairs + that are sampled from the beginning and end is user settable with an option + to sample everything. + version 0.12.0 ------------------ + Properly name percentiles as such in the sequence length distribution rather From d4a06a27c40631d5aaa758968cc93aad9c6bf3d5 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 20 Nov 2024 11:25:40 +0100 Subject: [PATCH 7/7] Update documentation with changes to the overrepresented sequences algorithm --- .../images/overrepresented_sampling.fodg | 115 ++++++++--- .../images/overrepresented_sampling.svg | 181 ++++++++++++++---- docs/module_options.rst | 14 +- 3 files changed, 240 insertions(+), 70 deletions(-) diff --git a/docs/_static/images/overrepresented_sampling.fodg b/docs/_static/images/overrepresented_sampling.fodg index 27b19a0..26205c3 100644 --- a/docs/_static/images/overrepresented_sampling.fodg +++ b/docs/_static/images/overrepresented_sampling.fodg @@ -1,11 +1,11 @@ - Ruben Vorderman2024-03-27T12:13:53.658159835LibreOffice/7.4.7.2$Linux_X86_64 LibreOffice_project/40$Build-22024-03-27T12:27:29.181257342Ruben VordermanPT13M36S3 + Ruben Vorderman2024-03-27T12:13:53.658159835LibreOffice/7.4.7.2$Linux_X86_64 LibreOffice_project/40$Build-22024-11-20T11:17:18.804415312Ruben VordermanPT20M2S4 - -5024 - -849 + -1890 + -1872 44763 30807 @@ -35,10 +35,10 @@ true 4 0 - -5024 - -849 - 31530 - 21700 + -1890 + -1872 + 33617 + 21741 1000 1000 500 @@ -100,8 +100,8 @@ false false false - lAH+/0dlbmVyaWMgUHJpbnRlcgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAU0dFTlBSVAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAWAAMAtQAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9R2VuZXJpYyBQcmludGVyCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luYWRqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4dERhdGEKUGFnZVNpemU6QTQAABIAQ09NUEFUX0RVUExFWF9NT0RFDwBEdXBsZXhNb2RlOjpPZmY= - Generic Printer + owH+/0hQX0VOVllfNzY0MF9zZXJpZXNfNEMzRDAzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpIUF9FTlZZXzc2NDBfc2VyaWVzXzRDM0QwMwAWAAMAxAAAAAAAAAAIAFZUAAAkbQAASm9iRGF0YSAxCnByaW50ZXI9SFBfRU5WWV83NjQwX3Nlcmllc180QzNEMDMKb3JpZW50YXRpb249UG9ydHJhaXQKY29waWVzPTEKY29sbGF0ZT1mYWxzZQptYXJnaW5hZGp1c3RtZW50PTAsMCwwLDAKY29sb3JkZXB0aD0yNApwc2xldmVsPTAKcGRmZGV2aWNlPTEKY29sb3JkZXZpY2U9MApQUERDb250ZXh0RGF0YQpQYWdlU2l6ZTpMZXR0ZXIAABIAQ09NUEFUX0RVUExFWF9NT0RFDwBEdXBsZXhNb2RlOjpPZmY= + HP_ENVY_7640_series_4C3D03 1250 @@ -131,9 +131,12 @@ + + + - + @@ -291,7 +294,7 @@ - + @@ -312,20 +315,24 @@ + + + + - + - + - + - + @@ -387,7 +394,7 @@ - + Sequence #1 @@ -395,7 +402,7 @@ barcode - + adapter @@ -423,11 +430,11 @@ - + - + @@ -435,19 +442,15 @@ - - - - - + - + - + @@ -503,6 +506,66 @@ + + Sequence #3 + + + + barcode + + + + adapter + + + + adapter + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/_static/images/overrepresented_sampling.svg b/docs/_static/images/overrepresented_sampling.svg index 3cf7716..45871bd 100644 --- a/docs/_static/images/overrepresented_sampling.svg +++ b/docs/_static/images/overrepresented_sampling.svg @@ -1,6 +1,6 @@ - + @@ -18,6 +18,7 @@ + @@ -59,9 +60,9 @@ - - - Sequence #1 + + + Sequence #1 @@ -73,9 +74,9 @@ - - - adapter + + + adapter @@ -122,16 +123,16 @@ - - - + + + - - - + + + @@ -143,122 +144,220 @@ - - - + + + - - - + + + - - - + + + - - - - - - - Sequence #2 - + barcode - + adapter - + adapter - + - + - + - + - + - + - + - + - + + + + + + Sequence #3 + + + + + + + barcode + + + + + + + adapter + + + + + + + adapter + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/module_options.rst b/docs/module_options.rst index a4d2cc9..4fd53e7 100644 --- a/docs/module_options.rst +++ b/docs/module_options.rst @@ -53,6 +53,10 @@ ends towards the middle. This means that the first and last fragment will always be the first 21 bp of the beginning and the last 21 bp in the end. As such the adapter sequences will always be sampled in the same frame. +To prevent many common genome repeats from ending up in the report, Sequali +limits the amount of sampling inwards to approximately 100 bp. With the default +fragment size of 21 bp, this means that 5 fragments are taken from each side. + .. figure:: _static/images/overrepresented_sampling.svg This figure shows how fragments are sampled in sequali. The silver elements @@ -60,9 +64,9 @@ such the adapter sequences will always be sampled in the same frame. sampled. Despite the length differences between sequence #1 and sequence #2 the fragments for the adapter and barcode sequences are the same. In Sequence #1 the fragments sampled from the back end overlap somewhat - with sequences from the front end. This is a necessity to ensure all of the - sequence is sampled when the length is not divisible by the size of the - fragments. + with sequences from the front end. In sequence #3 the sequence is quite + long so the middle is not sampled. This prevents many common genome repeats + from ending up in the report. For each sequence only unique fragments within the sequence are sampled, to avoid overrepresenting common genomic repeats. @@ -89,6 +93,10 @@ The following command line parameters affect this module: store. + ``--overrepresentation-sample-every``: How often a sequence is sampled. Default is every 8 sequences. ++ ``--overrepresentation-bases-from-start``: the minimum amount of bases to + take from the start of the sequence. Default is 100. ++ ``--overrepresentation-bases-from-end``: the minimum amount of bases to + take from the end of the sequence. Default is 100. .. [#F1] A canonical k-mer is the k-mer that has the lowest sort order compared to itself and its reverse complement. This way the canonical-kmer is