From 4b4bc2e763c6ebebd2c81af16b9ab56ad9632ead Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Tue, 19 Nov 2024 16:15:15 +0100
Subject: [PATCH 1/7] Rename SequenceDuplication to OverrpresentedSequences

---
 src/sequali/__init__.py            |   4 +-
 src/sequali/__main__.py            |  16 ++--
 src/sequali/_qc.pyi                |   2 +-
 src/sequali/_qcmodule.c            | 127 +++++++++++++++--------------
 src/sequali/report_modules.py      |  19 +++--
 tests/test_sequence_duplication.py |  84 +++++++++----------
 6 files changed, 134 insertions(+), 118 deletions(-)

diff --git a/src/sequali/__init__.py b/src/sequali/__init__.py
index cea6b5d..4cfa811 100644
--- a/src/sequali/__init__.py
+++ b/src/sequali/__init__.py
@@ -17,7 +17,7 @@
 from ._qc import A, C, G, N, T
 from ._qc import (
     AdapterCounter, BamParser, FastqParser, FastqRecordArrayView,
-    FastqRecordView, PerTileQuality, QCMetrics, SequenceDuplication
+    FastqRecordView, OverrepresentedSequences, PerTileQuality, QCMetrics,
 )
 from ._qc import NUMBER_OF_NUCS, NUMBER_OF_PHREDS, PHRED_MAX, TABLE_SIZE
 from ._version import __version__
@@ -32,7 +32,7 @@
     "FastqRecordArrayView",
     "PerTileQuality",
     "QCMetrics",
-    "SequenceDuplication",
+    "OverrepresentedSequences",
     "NUMBER_OF_NUCS",
     "NUMBER_OF_PHREDS",
     "PHRED_MAX",
diff --git a/src/sequali/__main__.py b/src/sequali/__main__.py
index 0c7568e..ad3ca3b 100644
--- a/src/sequali/__main__.py
+++ b/src/sequali/__main__.py
@@ -34,9 +34,9 @@
     DedupEstimator,
     InsertSizeMetrics,
     NanoStats,
+    OverrepresentedSequences,
     PerTileQuality,
     QCMetrics,
-    SequenceDuplication
 )
 from ._version import __version__
 from .adapters import DEFAULT_ADAPTER_FILE, adapters_from_file
@@ -188,7 +188,7 @@ def main() -> None:
     metrics1 = QCMetrics()
     per_tile_quality1 = PerTileQuality()
     nanostats1 = NanoStats()
-    sequence_duplication1 = SequenceDuplication(
+    overrepresented_sequences1 = OverrepresentedSequences(
         max_unique_fragments=args.overrepresentation_max_unique_fragments,
         fragment_length=args.overrepresentation_fragment_length,
         sample_every=args.overrepresentation_sample_every
@@ -219,7 +219,7 @@ def main() -> None:
         insert_size_metrics = InsertSizeMetrics()
         metrics2 = QCMetrics()
         per_tile_quality2 = PerTileQuality()
-        sequence_duplication2 = SequenceDuplication(
+        overrepresented_sequences2 = OverrepresentedSequences(
             max_unique_fragments=args.overrepresentation_max_unique_fragments,
             fragment_length=args.overrepresentation_fragment_length,
             sample_every=args.overrepresentation_sample_every
@@ -227,7 +227,7 @@ def main() -> None:
     else:
         metrics2 = None
         per_tile_quality2 = None
-        sequence_duplication2 = None
+        overrepresented_sequences2 = None
         insert_size_metrics = None
     with contextlib.ExitStack() as exit_stack:
         reader1 = NGSFile(args.input, threads - 1)
@@ -253,7 +253,7 @@ def main() -> None:
         for record_array1 in reader1:
             metrics1.add_record_array(record_array1)
             per_tile_quality1.add_record_array(record_array1)
-            sequence_duplication1.add_record_array(record_array1)
+            overrepresented_sequences1.add_record_array(record_array1)
             nanostats1.add_record_array(record_array1)
             if paired:
                 record_array2 = reader2.read(len(record_array1))
@@ -274,7 +274,7 @@ def main() -> None:
                 insert_size_metrics.add_record_array_pair(record_array1, record_array2)  # type: ignore  # noqa: E501
                 metrics2.add_record_array(record_array2)  # type: ignore  # noqa: E501
                 per_tile_quality2.add_record_array(record_array2)  # type: ignore  # noqa: E501
-                sequence_duplication2.add_record_array(record_array2)  # type: ignore  # noqa: E501
+                overrepresented_sequences2.add_record_array(record_array2)  # type: ignore  # noqa: E501
             else:
                 adapter_counter1.add_record_array(record_array1)   # type: ignore  # noqa: E501
                 dedup_estimator.add_record_array(record_array1)
@@ -289,14 +289,14 @@ def main() -> None:
         metrics=metrics1,
         adapter_counter=adapter_counter1,
         per_tile_quality=per_tile_quality1,
-        sequence_duplication=sequence_duplication1,
+        sequence_duplication=overrepresented_sequences1,
         dedup_estimator=dedup_estimator,
         nanostats=nanostats1,
         insert_size_metrics=insert_size_metrics,
         filename_reverse=args.input_reverse,
         metrics_reverse=metrics2,
         per_tile_quality_reverse=per_tile_quality2,
-        sequence_duplication_reverse=sequence_duplication2,
+        sequence_duplication_reverse=overrepresented_sequences2,
         adapters=adapters,
         fraction_threshold=fraction_threshold,
         min_threshold=min_threshold,
diff --git a/src/sequali/_qc.pyi b/src/sequali/_qc.pyi
index 06fd9ba..731bd2e 100644
--- a/src/sequali/_qc.pyi
+++ b/src/sequali/_qc.pyi
@@ -94,7 +94,7 @@ class PerTileQuality:
     def add_record_array(self, __record_array: FastqRecordArrayView) -> None: ...
     def get_tile_counts(self) -> List[Tuple[int, List[float], List[int]]]: ...
 
-class SequenceDuplication:
+class OverrepresentedSequences:
     number_of_sequences: int
     sampled_sequences: int
     collected_unique_fragments: int
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index bb8642e..add44e7 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -3135,9 +3135,9 @@ kmer_to_sequence(uint64_t kmer, size_t k, uint8_t *sequence)
     }
 }
 
-/*************************
- * SEQUENCE DUPLICATION *
- *************************/
+/****************************
+ * OverrepresentedSequences *
+ ****************************/
 
 /* A module that cuts the sequence in bits of k size. The canonical (lowest)
    representation of the bit is used.
@@ -3152,7 +3152,7 @@ kmer_to_sequence(uint64_t kmer, size_t k, uint8_t *sequence)
 #define DEFAULT_FRAGMENT_LENGTH 21
 #define DEFAULT_UNIQUE_SAMPLE_EVERY 8
 
-typedef struct _SequenceDuplicationStruct {
+typedef struct _OverrepresentedSequencesStruct {
     PyObject_HEAD
     size_t fragment_length;
     uint64_t number_of_sequences;
@@ -3166,10 +3166,10 @@ typedef struct _SequenceDuplicationStruct {
     uint64_t number_of_unique_fragments;
     uint64_t total_fragments;
     size_t sample_every;
-} SequenceDuplication;
+} OverrepresentedSequences;
 
 static void
-SequenceDuplication_dealloc(SequenceDuplication *self)
+OverrepresentedSequences_dealloc(OverrepresentedSequences *self)
 {
     PyMem_Free(self->staging_hash_table);
     PyMem_Free(self->hashes);
@@ -3178,14 +3178,14 @@ SequenceDuplication_dealloc(SequenceDuplication *self)
 }
 
 static PyObject *
-SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
 {
     Py_ssize_t max_unique_fragments = DEFAULT_MAX_UNIQUE_FRAGMENTS;
     Py_ssize_t fragment_length = DEFAULT_FRAGMENT_LENGTH;
     Py_ssize_t sample_every = DEFAULT_UNIQUE_SAMPLE_EVERY;
     static char *kwargnames[] = {"max_unique_fragments", "fragment_length",
                                  "sample_every", NULL};
-    static char *format = "|nnn:SequenceDuplication";
+    static char *format = "|nnn:OverrepresentedSequences";
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames,
                                      &max_unique_fragments, &fragment_length,
                                      &sample_every)) {
@@ -3221,7 +3221,7 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
         PyMem_Free(counts);
         return PyErr_NoMemory();
     }
-    SequenceDuplication *self = PyObject_New(SequenceDuplication, type);
+    OverrepresentedSequences *self = PyObject_New(OverrepresentedSequences, type);
     if (self == NULL) {
         PyMem_Free(hashes);
         PyMem_Free(counts);
@@ -3243,7 +3243,7 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
 }
 
 static void
-Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash)
+Sequence_duplication_insert_hash(OverrepresentedSequences *self, uint64_t hash)
 {
     uint64_t hash_to_index_int = self->hash_table_size - 1;
     uint64_t *hashes = self->hashes;
@@ -3271,7 +3271,8 @@ Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash)
 }
 
 static int
-SequenceDuplication_resize_staging(SequenceDuplication *self, uint64_t new_size)
+OverrepresentedSequences_resize_staging(OverrepresentedSequences *self,
+                                        uint64_t new_size)
 {
     if (new_size <= self->staging_hash_table_size) {
         return 0;
@@ -3310,7 +3311,8 @@ add_to_staging(uint64_t *staging_hash_table, uint64_t staging_hash_table_size,
 }
 
 static int
-SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
+OverrepresentedSequences_add_meta(OverrepresentedSequences *self,
+                                  struct FastqMeta *meta)
 {
     if (self->number_of_sequences % self->sample_every != 0) {
         self->number_of_sequences += 1;
@@ -3347,7 +3349,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
     size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5));
     size_t staging_hash_size = 1ULL << staging_hash_bits;
     if (staging_hash_size > self->staging_hash_table_size) {
-        if (SequenceDuplication_resize_staging(self, staging_hash_size) < 0) {
+        if (OverrepresentedSequences_resize_staging(self, staging_hash_size) < 0) {
             return -1;
         }
     }
@@ -3404,7 +3406,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
     return 0;
 }
 
-PyDoc_STRVAR(SequenceDuplication_add_read__doc__,
+PyDoc_STRVAR(OverrepresentedSequences_add_read__doc__,
              "add_read($self, read, /)\n"
              "--\n"
              "\n"
@@ -3413,10 +3415,11 @@ PyDoc_STRVAR(SequenceDuplication_add_read__doc__,
              "  read\n"
              "    A FastqRecordView object.\n");
 
-#define SequenceDuplication_add_read_method METH_O
+#define OverrepresentedSequences_add_read_method METH_O
 
 static PyObject *
-SequenceDuplication_add_read(SequenceDuplication *self, FastqRecordView *read)
+OverrepresentedSequences_add_read(OverrepresentedSequences *self,
+                                  FastqRecordView *read)
 {
     if (!FastqRecordView_CheckExact(read)) {
         PyErr_Format(PyExc_TypeError,
@@ -3424,13 +3427,13 @@ SequenceDuplication_add_read(SequenceDuplication *self, FastqRecordView *read)
                      Py_TYPE(read)->tp_name);
         return NULL;
     }
-    if (SequenceDuplication_add_meta(self, &read->meta) != 0) {
+    if (OverrepresentedSequences_add_meta(self, &read->meta) != 0) {
         return NULL;
     }
     Py_RETURN_NONE;
 }
 
-PyDoc_STRVAR(SequenceDuplication_add_record_array__doc__,
+PyDoc_STRVAR(OverrepresentedSequences_add_record_array__doc__,
              "add_record_array($self, record_array, /)\n"
              "--\n"
              "\n"
@@ -3439,11 +3442,11 @@ PyDoc_STRVAR(SequenceDuplication_add_record_array__doc__,
              "  record_array\n"
              "    A FastqRecordArrayView object.\n");
 
-#define SequenceDuplication_add_record_array_method METH_O
+#define OverrepresentedSequences_add_record_array_method METH_O
 
 static PyObject *
-SequenceDuplication_add_record_array(SequenceDuplication *self,
-                                     FastqRecordArrayView *record_array)
+OverrepresentedSequences_add_record_array(OverrepresentedSequences *self,
+                                          FastqRecordArrayView *record_array)
 {
     if (!FastqRecordArrayView_CheckExact(record_array)) {
         PyErr_Format(
@@ -3455,24 +3458,24 @@ SequenceDuplication_add_record_array(SequenceDuplication *self,
     Py_ssize_t number_of_records = Py_SIZE(record_array);
     struct FastqMeta *records = record_array->records;
     for (Py_ssize_t i = 0; i < number_of_records; i++) {
-        if (SequenceDuplication_add_meta(self, records + i) != 0) {
+        if (OverrepresentedSequences_add_meta(self, records + i) != 0) {
             return NULL;
         }
     }
     Py_RETURN_NONE;
 }
 
-PyDoc_STRVAR(SequenceDuplication_sequence_counts__doc__,
+PyDoc_STRVAR(OverrepresentedSequences_sequence_counts__doc__,
              "sequence_counts($self, /)\n"
              "--\n"
              "\n"
              "Get a dictionary with sequence counts \n");
 
-#define SequenceDuplication_sequence_counts_method METH_NOARGS
+#define OverrepresentedSequences_sequence_counts_method METH_NOARGS
 
 static PyObject *
-SequenceDuplication_sequence_counts(SequenceDuplication *self,
-                                    PyObject *Py_UNUSED(ignore))
+OverrepresentedSequences_sequence_counts(OverrepresentedSequences *self,
+                                         PyObject *Py_UNUSED(ignore))
 {
     PyObject *count_dict = PyDict_New();
     if (count_dict == NULL) {
@@ -3511,7 +3514,7 @@ SequenceDuplication_sequence_counts(SequenceDuplication *self,
 }
 
 PyDoc_STRVAR(
-    SequenceDuplication_overrepresented_sequences__doc__,
+    OverrepresentedSequences_overrepresented_sequences__doc__,
     "overrepresented_sequences($self, threshold=0.001)\n"
     "--\n"
     "\n"
@@ -3534,19 +3537,21 @@ PyDoc_STRVAR(
     "high "
     "    numbers of sequences.");
 
-#define SequenceDuplication_overrepresented_sequences_method \
+#define OverrepresentedSequences_overrepresented_sequences_method \
     METH_VARARGS | METH_KEYWORDS
 
 static PyObject *
-SequenceDuplication_overrepresented_sequences(SequenceDuplication *self,
-                                              PyObject *args, PyObject *kwargs)
+OverrepresentedSequences_overrepresented_sequences(OverrepresentedSequences *self,
+                                                   PyObject *args,
+                                                   PyObject *kwargs)
 {
     double threshold = 0.0001;  // 0.01 %
     Py_ssize_t min_threshold = 1;
     Py_ssize_t max_threshold = PY_SSIZE_T_MAX;
     static char *kwargnames[] = {"threshold_fraction", "min_threshold",
                                  "max_threshold", NULL};
-    static char *format = "|dnn:SequenceDuplication.overrepresented_sequences";
+    static char *format =
+        "|dnn:OverrepresentedSequences.overrepresented_sequences";
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames, &threshold,
                                      &min_threshold, &max_threshold)) {
         return NULL;
@@ -3627,51 +3632,53 @@ SequenceDuplication_overrepresented_sequences(SequenceDuplication *self,
     return NULL;
 }
 
-static PyMethodDef SequenceDuplication_methods[] = {
-    {"add_read", (PyCFunction)SequenceDuplication_add_read,
-     SequenceDuplication_add_read_method, SequenceDuplication_add_read__doc__},
-    {"add_record_array", (PyCFunction)SequenceDuplication_add_record_array,
-     SequenceDuplication_add_record_array_method,
-     SequenceDuplication_add_record_array__doc__},
-    {"sequence_counts", (PyCFunction)SequenceDuplication_sequence_counts,
-     SequenceDuplication_sequence_counts_method,
-     SequenceDuplication_sequence_counts__doc__},
+static PyMethodDef OverrepresentedSequences_methods[] = {
+    {"add_read", (PyCFunction)OverrepresentedSequences_add_read,
+     OverrepresentedSequences_add_read_method,
+     OverrepresentedSequences_add_read__doc__},
+    {"add_record_array", (PyCFunction)OverrepresentedSequences_add_record_array,
+     OverrepresentedSequences_add_record_array_method,
+     OverrepresentedSequences_add_record_array__doc__},
+    {"sequence_counts", (PyCFunction)OverrepresentedSequences_sequence_counts,
+     OverrepresentedSequences_sequence_counts_method,
+     OverrepresentedSequences_sequence_counts__doc__},
     {"overrepresented_sequences",
-     (PyCFunction)(void (*)(void))SequenceDuplication_overrepresented_sequences,
-     SequenceDuplication_overrepresented_sequences_method,
-     SequenceDuplication_overrepresented_sequences__doc__},
+     (PyCFunction)(void (*)(void))OverrepresentedSequences_overrepresented_sequences,
+     OverrepresentedSequences_overrepresented_sequences_method,
+     OverrepresentedSequences_overrepresented_sequences__doc__},
     {NULL},
 };
 
-static PyMemberDef SequenceDuplication_members[] = {
+static PyMemberDef OverrepresentedSequences_members[] = {
     {"number_of_sequences", T_ULONGLONG,
-     offsetof(SequenceDuplication, number_of_sequences), READONLY,
+     offsetof(OverrepresentedSequences, number_of_sequences), READONLY,
      "The total number of sequences submitted."},
     {"sampled_sequences", T_ULONGLONG,
-     offsetof(SequenceDuplication, sampled_sequences), READONLY,
+     offsetof(OverrepresentedSequences, sampled_sequences), READONLY,
      "The total number of sequences that were analysed."},
     {"collected_unique_fragments", T_ULONGLONG,
-     offsetof(SequenceDuplication, number_of_unique_fragments), READONLY,
+     offsetof(OverrepresentedSequences, number_of_unique_fragments), READONLY,
      "The number of unique fragments collected."},
     {"max_unique_fragments", T_ULONGLONG,
-     offsetof(SequenceDuplication, max_unique_fragments), READONLY,
+     offsetof(OverrepresentedSequences, max_unique_fragments), READONLY,
      "The maximum number of unique sequences stored in the object."},
-    {"fragment_length", T_BYTE, offsetof(SequenceDuplication, fragment_length),
+    {"fragment_length", T_BYTE, offsetof(OverrepresentedSequences, fragment_length),
      READONLY, "The length of the sampled sequences"},
-    {"sample_every", T_PYSSIZET, offsetof(SequenceDuplication, sample_every),
+    {"sample_every", T_PYSSIZET, offsetof(OverrepresentedSequences, sample_every),
      READONLY, "One in this many reads is sampled"},
-    {"total_fragments", T_ULONGLONG, offsetof(SequenceDuplication, total_fragments),
-     READONLY, "Total number of fragments."},
+    {"total_fragments", T_ULONGLONG,
+     offsetof(OverrepresentedSequences, total_fragments), READONLY,
+     "Total number of fragments."},
     {NULL},
 };
 
-static PyTypeObject SequenceDuplication_Type = {
-    .tp_name = "_qc.SequenceDuplication",
-    .tp_basicsize = sizeof(SequenceDuplication),
-    .tp_dealloc = (destructor)(SequenceDuplication_dealloc),
-    .tp_new = (newfunc)SequenceDuplication__new__,
-    .tp_members = SequenceDuplication_members,
-    .tp_methods = SequenceDuplication_methods,
+static PyTypeObject OverrepresentedSequences_Type = {
+    .tp_name = "_qc.OverrepresentedSequences",
+    .tp_basicsize = sizeof(OverrepresentedSequences),
+    .tp_dealloc = (destructor)(OverrepresentedSequences_dealloc),
+    .tp_new = (newfunc)OverrepresentedSequences__new__,
+    .tp_members = OverrepresentedSequences_members,
+    .tp_methods = OverrepresentedSequences_methods,
 };
 
 /*******************
@@ -5193,7 +5200,7 @@ PyInit__qc(void)
     if (python_module_add_type(m, &PerTileQuality_Type) != 0) {
         return NULL;
     }
-    if (python_module_add_type(m, &SequenceDuplication_Type) != 0) {
+    if (python_module_add_type(m, &OverrepresentedSequences_Type) != 0) {
         return NULL;
     }
     if (python_module_add_type(m, &DedupEstimator_Type) != 0) {
diff --git a/src/sequali/report_modules.py b/src/sequali/report_modules.py
index d539c73..0c9af2e 100644
--- a/src/sequali/report_modules.py
+++ b/src/sequali/report_modules.py
@@ -35,9 +35,16 @@
 import pygal.style  # type: ignore
 
 from ._qc import A, C, G, N, T
-from ._qc import (AdapterCounter, DedupEstimator,
-                  INSERT_SIZE_MAX_ADAPTER_STORE_SIZE, InsertSizeMetrics,
-                  NanoStats, PerTileQuality, QCMetrics, SequenceDuplication)
+from ._qc import (
+    AdapterCounter,
+    DedupEstimator,
+    INSERT_SIZE_MAX_ADAPTER_STORE_SIZE,
+    InsertSizeMetrics,
+    NanoStats,
+    OverrepresentedSequences,
+    PerTileQuality,
+    QCMetrics,
+)
 from ._qc import NUMBER_OF_NUCS, NUMBER_OF_PHREDS, PHRED_MAX
 from ._version import __version__
 from .adapters import Adapter
@@ -1495,7 +1502,7 @@ def to_html(self) -> str:
     @classmethod
     def from_sequence_duplication(
             cls,
-            seqdup: SequenceDuplication,
+            seqdup: OverrepresentedSequences,
             fraction_threshold: float = DEFAULT_FRACTION_THRESHOLD,
             min_threshold: int = DEFAULT_MIN_THRESHOLD,
             max_threshold: int = DEFAULT_MAX_THRESHOLD,
@@ -2131,7 +2138,7 @@ def calculate_stats(
         filename: str,
         metrics: QCMetrics,
         per_tile_quality: PerTileQuality,
-        sequence_duplication: SequenceDuplication,
+        sequence_duplication: OverrepresentedSequences,
         dedup_estimator: DedupEstimator,
         nanostats: NanoStats,
         adapters: List[Adapter],
@@ -2140,7 +2147,7 @@ def calculate_stats(
         insert_size_metrics: Optional[InsertSizeMetrics] = None,
         metrics_reverse: Optional[QCMetrics] = None,
         per_tile_quality_reverse: Optional[PerTileQuality] = None,
-        sequence_duplication_reverse: Optional[SequenceDuplication] = None,
+        sequence_duplication_reverse: Optional[OverrepresentedSequences] = None,
         graph_resolution: int = 200,
         fraction_threshold: float = DEFAULT_FRACTION_THRESHOLD,
         min_threshold: int = DEFAULT_MIN_THRESHOLD,
diff --git a/tests/test_sequence_duplication.py b/tests/test_sequence_duplication.py
index b7cee7f..0801ac7 100644
--- a/tests/test_sequence_duplication.py
+++ b/tests/test_sequence_duplication.py
@@ -19,7 +19,7 @@
 
 import pytest
 
-from sequali import FastqRecordView, SequenceDuplication
+from sequali import FastqRecordView, OverrepresentedSequences
 
 
 def view_from_sequence(sequence: str) -> FastqRecordView:
@@ -33,9 +33,11 @@ def view_from_sequence(sequence: str) -> FastqRecordView:
 def test_sequence_duplication():
     max_unique_fragments = 100_000
     fragment_length = 31
-    seqdup = SequenceDuplication(max_unique_fragments=max_unique_fragments,
-                                 fragment_length=fragment_length,
-                                 sample_every=1)
+    seqdup = OverrepresentedSequences(
+        max_unique_fragments=max_unique_fragments,
+        fragment_length=fragment_length,
+        sample_every=1
+    )
     # Create unique sequences by using all combinations of ACGT for the amount
     # of letters that is necessary to completely saturate the maximum unique
     # sequences
@@ -59,18 +61,18 @@ def test_sequence_duplication():
 
 
 def test_sequence_duplication_add_read_no_view():
-    seqdup = SequenceDuplication()
+    seqs = OverrepresentedSequences()
     with pytest.raises(TypeError) as error:
-        seqdup.add_read(b"ACGT")  # type: ignore
+        seqs.add_read(b"ACGT")  # type: ignore
     error.match("FastqRecordView")
     error.match("bytes")
 
 
 @pytest.mark.parametrize("threshold", [-0.1, 1.1])
 def test_sequence_duplication_overrepresented_sequences_faulty_threshold(threshold):
-    seqdup = SequenceDuplication()
+    seqs = OverrepresentedSequences()
     with pytest.raises(ValueError) as error:
-        seqdup.overrepresented_sequences(threshold_fraction=threshold)
+        seqs.overrepresented_sequences(threshold_fraction=threshold)
     error.match(str(threshold))
     error.match("between")
     error.match("0.0")
@@ -79,20 +81,20 @@ def test_sequence_duplication_overrepresented_sequences_faulty_threshold(thresho
 
 def test_sequence_duplication_overrepresented_sequences():
     fragment_length = 31
-    seqdup = SequenceDuplication(sample_every=1, fragment_length=fragment_length)
+    seqs = OverrepresentedSequences(sample_every=1, fragment_length=fragment_length)
     for i in range(100):
-        seqdup.add_read(view_from_sequence("A" * fragment_length))
+        seqs.add_read(view_from_sequence("A" * fragment_length))
     for i in range(200):
-        seqdup.add_read(view_from_sequence("C" * fragment_length))
+        seqs.add_read(view_from_sequence("C" * fragment_length))
     for i in range(2000):
-        seqdup.add_read(view_from_sequence("G" * fragment_length))
+        seqs.add_read(view_from_sequence("G" * fragment_length))
     for i in range(10):
-        seqdup.add_read(view_from_sequence("T" * fragment_length))
-    seqdup.add_read(view_from_sequence("C" * (fragment_length - 1) + "A"))
+        seqs.add_read(view_from_sequence("T" * fragment_length))
+    seqs.add_read(view_from_sequence("C" * (fragment_length - 1) + "A"))
     for i in range(100_000 - (100 + 200 + 2000 + 10 + 1)):
         # Count up to 100_000 to get nice fractions for all the sequences
-        seqdup.add_read(view_from_sequence("A" * (fragment_length - 1) + "C"))
-    overrepresented = seqdup.overrepresented_sequences(threshold_fraction=0.001)
+        seqs.add_read(view_from_sequence("A" * (fragment_length - 1) + "C"))
+    overrepresented = seqs.overrepresented_sequences(threshold_fraction=0.001)
     assert overrepresented[0][2] == "A" * (fragment_length - 1) + "C"
     assert overrepresented[1][2] == "C" * fragment_length
     assert overrepresented[1][0] == 2200
@@ -100,14 +102,14 @@ def test_sequence_duplication_overrepresented_sequences():
     assert overrepresented[2][1] == 110 / 100_000
     # Assert no other sequences recorded as overrepresented.
     assert len(overrepresented) == 3
-    overrepresented = seqdup.overrepresented_sequences(threshold_fraction=0.00001)
+    overrepresented = seqs.overrepresented_sequences(threshold_fraction=0.00001)
     assert overrepresented[-1][2] == "C" * (fragment_length - 1) + "A"
-    overrepresented = seqdup.overrepresented_sequences(
+    overrepresented = seqs.overrepresented_sequences(
         threshold_fraction=0.00001,
         min_threshold=2,
     )
     assert overrepresented[-1][2] == "A" * fragment_length
-    overrepresented = seqdup.overrepresented_sequences(
+    overrepresented = seqs.overrepresented_sequences(
         threshold_fraction=0.1,
         min_threshold=2,
         max_threshold=1000,
@@ -118,13 +120,13 @@ def test_sequence_duplication_overrepresented_sequences():
 
 def test_sequence_duplication_case_insensitive():
     fragment_length = 31
-    seqdup = SequenceDuplication(fragment_length=fragment_length, sample_every=1)
-    seqdup.add_read(view_from_sequence("aaTTaca" * 5))
-    seqdup.add_read(view_from_sequence("AAttACA" * 5))
-    seqcounts = seqdup.sequence_counts()
-    assert seqdup.number_of_sequences == 2
-    assert seqdup.total_fragments == 4
-    assert seqdup.collected_unique_fragments == 2
+    seqs = OverrepresentedSequences(fragment_length=fragment_length, sample_every=1)
+    seqs.add_read(view_from_sequence("aaTTaca" * 5))
+    seqs.add_read(view_from_sequence("AAttACA" * 5))
+    seqcounts = seqs.sequence_counts()
+    assert seqs.number_of_sequences == 2
+    assert seqs.total_fragments == 4
+    assert seqs.collected_unique_fragments == 2
     assert len(seqcounts) == 2
     assert seqcounts[("AATTACA" * 5)[:fragment_length]] == 2
     assert seqcounts[("AATTACA" * 5)[-fragment_length:]] == 2
@@ -132,13 +134,13 @@ def test_sequence_duplication_case_insensitive():
 
 @pytest.mark.parametrize("divisor", list(range(1, 21)))
 def test_sequence_duplication_sampling_rate(divisor):
-    seqdup = SequenceDuplication(sample_every=divisor)
+    seqs = OverrepresentedSequences(sample_every=divisor)
     read = view_from_sequence("AAAA")
     number_of_sequences = 10_000
     for i in range(number_of_sequences):
-        seqdup.add_read(read)
-    assert seqdup.number_of_sequences == number_of_sequences
-    assert seqdup.sampled_sequences == (number_of_sequences + divisor - 1) // divisor
+        seqs.add_read(read)
+    assert seqs.number_of_sequences == number_of_sequences
+    assert seqs.sampled_sequences == (number_of_sequences + divisor - 1) // divisor
 
 
 @pytest.mark.parametrize(["sequence", "result"], [
@@ -150,29 +152,29 @@ def test_sequence_duplication_sampling_rate(divisor):
     ("GATTACGATTAC", {"ATC": 1, "GTA": 1}),
 ])
 def test_sequence_duplication_all_fragments(sequence, result):
-    seqdup = SequenceDuplication(fragment_length=3, sample_every=1)
-    seqdup.add_read(view_from_sequence(sequence))
-    seq_counts = seqdup.sequence_counts()
+    seqs = OverrepresentedSequences(fragment_length=3, sample_every=1)
+    seqs.add_read(view_from_sequence(sequence))
+    seq_counts = seqs.sequence_counts()
     assert seq_counts == result
 
 
 def test_very_short_sequence():
     # With 32 byte load this will overflow the used memory.
-    seqdup = SequenceDuplication(fragment_length=3, sample_every=1)
-    seqdup.add_read(view_from_sequence("ACT"))
-    assert seqdup.sequence_counts() == {"ACT": 1}
+    seqs = OverrepresentedSequences(fragment_length=3, sample_every=1)
+    seqs.add_read(view_from_sequence("ACT"))
+    assert seqs.sequence_counts() == {"ACT": 1}
 
 
 def test_non_iupac_warning():
-    seqdup = SequenceDuplication(fragment_length=3, sample_every=1)
+    seqs = OverrepresentedSequences(fragment_length=3, sample_every=1)
     with pytest.warns(UserWarning, match="KKK"):
-        seqdup.add_read(view_from_sequence("KKK"))
+        seqs.add_read(view_from_sequence("KKK"))
 
 
 def test_valid_does_not_warn_for_n():
-    seqdup = SequenceDuplication(fragment_length=3, sample_every=1)
+    seqs = OverrepresentedSequences(fragment_length=3, sample_every=1)
     with warnings.catch_warnings():
         warnings.simplefilter("error")
-        seqdup.add_read(view_from_sequence("ACGTN"))
+        seqs.add_read(view_from_sequence("ACGTN"))
     # N does lead to a sample not being loaded.
-    assert seqdup.sampled_sequences == 1
+    assert seqs.sampled_sequences == 1

From ae3a8a0b7f9180bf8d3a4c7181afb27860a10f4a Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Tue, 19 Nov 2024 16:34:15 +0100
Subject: [PATCH 2/7] Add facilities to set values for from beginning and end
 sampling

---
 src/sequali/__main__.py | 16 ++++++++++++++++
 src/sequali/_qc.pyi     |  7 ++++++-
 src/sequali/_qcmodule.c | 21 +++++++++++++++++++--
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/src/sequali/__main__.py b/src/sequali/__main__.py
index ad3ca3b..68cab23 100644
--- a/src/sequali/__main__.py
+++ b/src/sequali/__main__.py
@@ -23,6 +23,8 @@
 
 from ._qc import (
     AdapterCounter,
+    DEFAULT_BASES_FROM_END,
+    DEFAULT_BASES_FROM_START,
     DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS,
     DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH,
     DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET,
@@ -119,6 +121,20 @@ def argument_parser() -> argparse.ArgumentParser:
                              f"gets filled up with more sequences from the "
                              f"beginning. "
                              f"Default: 1 in {DEFAULT_UNIQUE_SAMPLE_EVERY}.")
+    parser.add_argument("--overrepresentation-bases-from-start", type=int,
+                        default=DEFAULT_BASES_FROM_START,
+                        metavar="BP",
+                        help=f"The minimum amount of bases sampled from the "
+                             f"start of the read. There might be slight overshoot "
+                             f"depending on the fragment length. "
+                             f"Default: {DEFAULT_BASES_FROM_START}")
+    parser.add_argument("--overrepresentation-bases-from-end", type=int,
+                        default=DEFAULT_BASES_FROM_END,
+                        metavar="BP",
+                        help=f"The minimum amount of bases sampled from the "
+                             f"end of the read. There might be slight overshoot "
+                             f"depending on the fragment length. "
+                             f"Default: {DEFAULT_BASES_FROM_END}")
     parser.add_argument("--duplication-max-stored-fingerprints", type=int,
                         default=DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS,
                         metavar="N",
diff --git a/src/sequali/_qc.pyi b/src/sequali/_qc.pyi
index 731bd2e..77fdaf8 100644
--- a/src/sequali/_qc.pyi
+++ b/src/sequali/_qc.pyi
@@ -32,6 +32,8 @@ DEFAULT_MAX_UNIQUE_FRAGMENTS: int
 DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS: int
 DEFAULT_FRAGMENT_LENGTH: int
 DEFAULT_UNIQUE_SAMPLE_EVERY: int
+DEFAULT_BASES_FROM_START: int
+DEFAULT_BASES_FROM_END: int
 DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH: int
 DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH: int
 DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET: int
@@ -106,7 +108,10 @@ class OverrepresentedSequences:
     def __init__(self,
                  max_unique_fragments: int = DEFAULT_MAX_UNIQUE_FRAGMENTS,
                  fragment_length: int = DEFAULT_FRAGMENT_LENGTH,
-                 sample_every: int = DEFAULT_UNIQUE_SAMPLE_EVERY): ...
+                 sample_every: int = DEFAULT_UNIQUE_SAMPLE_EVERY,
+                 bases_from_start: int = DEFAULT_BASES_FROM_START,
+                 bases_from_end = DEFAULT_BASES_FROM_END,
+    ): ...
     def add_read(self, __read: FastqRecordView) -> None: ...
     def add_record_array(self, __record_array: FastqRecordArrayView) -> None: ...
     def sequence_counts(self) -> Dict[str, int]: ...
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index add44e7..be070af 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -3151,6 +3151,8 @@ kmer_to_sequence(uint64_t kmer, size_t k, uint8_t *sequence)
 #define DEFAULT_MAX_UNIQUE_FRAGMENTS 5000000
 #define DEFAULT_FRAGMENT_LENGTH 21
 #define DEFAULT_UNIQUE_SAMPLE_EVERY 8
+#define DEFAULT_BASES_FROM_START 100
+#define DEFAULT_BASES_FROM_END 100
 
 typedef struct _OverrepresentedSequencesStruct {
     PyObject_HEAD
@@ -3166,6 +3168,8 @@ typedef struct _OverrepresentedSequencesStruct {
     uint64_t number_of_unique_fragments;
     uint64_t total_fragments;
     size_t sample_every;
+    size_t bases_from_start;
+    size_t bases_from_end;
 } OverrepresentedSequences;
 
 static void
@@ -3183,9 +3187,12 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw
     Py_ssize_t max_unique_fragments = DEFAULT_MAX_UNIQUE_FRAGMENTS;
     Py_ssize_t fragment_length = DEFAULT_FRAGMENT_LENGTH;
     Py_ssize_t sample_every = DEFAULT_UNIQUE_SAMPLE_EVERY;
+    Py_ssize_t bases_from_start = DEFAULT_BASES_FROM_START;
+    Py_ssize_t bases_from_end = DEFAULT_BASES_FROM_END;
     static char *kwargnames[] = {"max_unique_fragments", "fragment_length",
-                                 "sample_every", NULL};
-    static char *format = "|nnn:OverrepresentedSequences";
+                                 "sample_every",         "bases_from_start",
+                                 "bases_from_end",       NULL};
+    static char *format = "|nnnnn:OverrepresentedSequences";
     if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames,
                                      &max_unique_fragments, &fragment_length,
                                      &sample_every)) {
@@ -3209,6 +3216,12 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw
                      "sample_every must be 1 or greater. Got %zd", sample_every);
         return NULL;
     }
+    if (bases_from_start < 1) {
+        bases_from_start = UINT32_MAX;
+    }
+    if (bases_from_end < 1) {
+        bases_from_end = UINT32_MAX;
+    }
     /* If size is a power of 2, the modulo HASH_TABLE_SIZE can be optimised to
        a bitwise AND. Using 1.5 times as a base we ensure that the hashtable is
        utilized for at most 2/3. (Increased business degrades performance.) */
@@ -3239,6 +3252,8 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw
     self->hashes = hashes;
     self->counts = counts;
     self->sample_every = sample_every;
+    self->bases_from_start = bases_from_start;
+    self->bases_from_end = bases_from_end;
     return (PyObject *)self;
 }
 
@@ -5233,6 +5248,8 @@ PyInit__qc(void)
     PyModule_AddIntMacro(m, DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS);
     PyModule_AddIntMacro(m, DEFAULT_FRAGMENT_LENGTH);
     PyModule_AddIntMacro(m, DEFAULT_UNIQUE_SAMPLE_EVERY);
+    PyModule_AddIntMacro(m, DEFAULT_BASES_FROM_START);
+    PyModule_AddIntMacro(m, DEFAULT_BASES_FROM_END);
     PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH);
     PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH);
     PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET);

From ea206581a3aa2faae4a31059276886cfac4d552f Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Tue, 19 Nov 2024 17:33:07 +0100
Subject: [PATCH 3/7] Implement sampling from extremeties

---
 src/sequali/_qcmodule.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index be070af..6acc731 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -3168,8 +3168,8 @@ typedef struct _OverrepresentedSequencesStruct {
     uint64_t number_of_unique_fragments;
     uint64_t total_fragments;
     size_t sample_every;
-    size_t bases_from_start;
-    size_t bases_from_end;
+    Py_ssize_t fragments_from_start;
+    Py_ssize_t fragments_from_end;
 } OverrepresentedSequences;
 
 static void
@@ -3252,8 +3252,10 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw
     self->hashes = hashes;
     self->counts = counts;
     self->sample_every = sample_every;
-    self->bases_from_start = bases_from_start;
-    self->bases_from_end = bases_from_end;
+    self->fragments_from_start =
+        (bases_from_start + fragment_length - 1) / fragment_length;
+    self->fragments_from_end =
+        (bases_from_end + fragment_length - 1) / fragment_length;
     return (PyObject *)self;
 }
 
@@ -3358,9 +3360,28 @@ OverrepresentedSequences_add_meta(OverrepresentedSequences *self,
        If the sequence length is exactly divisible by the fragment length, this
        results in exactly no overlap between front and back fragments, while
        still all of the sequence is being sampled.
+
+       If the sequence is very large the amount of samples is taken is limited
+       by a user-settable maximum.
+    */
+    /* Vader: Luke, Obi-Wan never told you about the algorithm...
+       Luke:  He told me enough! It uses integer division!
+       Vader: Yes Luke, we have to use integer division.
+       Luke:  No, that's not true! The compiler can optimize integer division
+              away for constants!
+       Vader: Search your feelings Luke! The fragment length has to be user
+              settable!
+       Luke:  NOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
     */
-    Py_ssize_t total_fragments =
+    Py_ssize_t max_fragments =
         (sequence_length + fragment_length - 1) / fragment_length;
+    Py_ssize_t from_mid_point_fragments = max_fragments / 2;
+    Py_ssize_t max_start_fragments = max_fragments - from_mid_point_fragments;
+    Py_ssize_t fragments_from_start =
+        Py_MIN(self->fragments_from_start, max_start_fragments);
+    Py_ssize_t fragments_from_end =
+        Py_MIN(self->fragments_from_end, from_mid_point_fragments);
+    Py_ssize_t total_fragments = fragments_from_start + fragments_from_end;
     size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5));
     size_t staging_hash_size = 1ULL << staging_hash_bits;
     if (staging_hash_size > self->staging_hash_table_size) {
@@ -3371,12 +3392,12 @@ OverrepresentedSequences_add_meta(OverrepresentedSequences *self,
     uint64_t *staging_hash_table = self->staging_hash_table;
     memset(staging_hash_table, 0, staging_hash_size * sizeof(uint64_t));
 
-    Py_ssize_t from_mid_point_fragments = total_fragments / 2;
-    Py_ssize_t mid_point =
-        sequence_length - (from_mid_point_fragments * fragment_length);
+    Py_ssize_t start_end = fragments_from_start * fragment_length;
+    Py_ssize_t end_start =
+        sequence_length - (fragments_from_end * fragment_length);
     bool warn_unknown = false;
     // Sample front sequences
-    for (Py_ssize_t i = 0; i < mid_point; i += fragment_length) {
+    for (Py_ssize_t i = 0; i < start_end; i += fragment_length) {
         int64_t kmer = sequence_to_canonical_kmer(sequence + i, fragment_length);
         if (kmer < 0) {
             if (kmer == TWOBIT_UNKNOWN_CHAR) {
@@ -3390,7 +3411,7 @@ OverrepresentedSequences_add_meta(OverrepresentedSequences *self,
     }
 
     // Sample back sequences
-    for (Py_ssize_t i = mid_point; i < sequence_length; i += fragment_length) {
+    for (Py_ssize_t i = end_start; i < sequence_length; i += fragment_length) {
         int64_t kmer = sequence_to_canonical_kmer(sequence + i, fragment_length);
         if (kmer < 0) {
             if (kmer == TWOBIT_UNKNOWN_CHAR) {

From 163103e3819f0472648829cab203298ed7307470 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Wed, 20 Nov 2024 10:45:04 +0100
Subject: [PATCH 4/7] Rename tests

---
 ...ation.py => test_overrepresented_sequences.py} | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)
 rename tests/{test_sequence_duplication.py => test_overrepresented_sequences.py} (94%)

diff --git a/tests/test_sequence_duplication.py b/tests/test_overrepresented_sequences.py
similarity index 94%
rename from tests/test_sequence_duplication.py
rename to tests/test_overrepresented_sequences.py
index 0801ac7..7be86dc 100644
--- a/tests/test_sequence_duplication.py
+++ b/tests/test_overrepresented_sequences.py
@@ -30,7 +30,7 @@ def view_from_sequence(sequence: str) -> FastqRecordView:
     )
 
 
-def test_sequence_duplication():
+def test_overrepresented_sequences():
     max_unique_fragments = 100_000
     fragment_length = 31
     seqdup = OverrepresentedSequences(
@@ -60,7 +60,7 @@ def test_sequence_duplication():
     assert sequence_counts[fragment_length * "A"] == 2
 
 
-def test_sequence_duplication_add_read_no_view():
+def test_overrepresented_sequences_add_read_no_view():
     seqs = OverrepresentedSequences()
     with pytest.raises(TypeError) as error:
         seqs.add_read(b"ACGT")  # type: ignore
@@ -69,7 +69,8 @@ def test_sequence_duplication_add_read_no_view():
 
 
 @pytest.mark.parametrize("threshold", [-0.1, 1.1])
-def test_sequence_duplication_overrepresented_sequences_faulty_threshold(threshold):
+def test_overrepresented_sequences_overrepresented_sequences_faulty_threshold(
+        threshold):
     seqs = OverrepresentedSequences()
     with pytest.raises(ValueError) as error:
         seqs.overrepresented_sequences(threshold_fraction=threshold)
@@ -79,7 +80,7 @@ def test_sequence_duplication_overrepresented_sequences_faulty_threshold(thresho
     error.match("1.0")
 
 
-def test_sequence_duplication_overrepresented_sequences():
+def test_overrepresented_sequences_overrepresented_sequences():
     fragment_length = 31
     seqs = OverrepresentedSequences(sample_every=1, fragment_length=fragment_length)
     for i in range(100):
@@ -118,7 +119,7 @@ def test_sequence_duplication_overrepresented_sequences():
     assert overrepresented[1][2] == "C" * fragment_length
 
 
-def test_sequence_duplication_case_insensitive():
+def test_overrepresented_sequences_case_insensitive():
     fragment_length = 31
     seqs = OverrepresentedSequences(fragment_length=fragment_length, sample_every=1)
     seqs.add_read(view_from_sequence("aaTTaca" * 5))
@@ -133,7 +134,7 @@ def test_sequence_duplication_case_insensitive():
 
 
 @pytest.mark.parametrize("divisor", list(range(1, 21)))
-def test_sequence_duplication_sampling_rate(divisor):
+def test_overrepresented_sequences_sampling_rate(divisor):
     seqs = OverrepresentedSequences(sample_every=divisor)
     read = view_from_sequence("AAAA")
     number_of_sequences = 10_000
@@ -151,7 +152,7 @@ def test_sequence_duplication_sampling_rate(divisor):
     # Fragments that are duplicated in the sequence should only be recorded once.
     ("GATTACGATTAC", {"ATC": 1, "GTA": 1}),
 ])
-def test_sequence_duplication_all_fragments(sequence, result):
+def test_overrepresented_sequences_all_fragments(sequence, result):
     seqs = OverrepresentedSequences(fragment_length=3, sample_every=1)
     seqs.add_read(view_from_sequence(sequence))
     seq_counts = seqs.sequence_counts()

From 7a78762db722f2a9122ff43b88ec4d5e1c477b1b Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Wed, 20 Nov 2024 11:03:04 +0100
Subject: [PATCH 5/7] Add a behavior test for overrepresented sequences
 sampling

---
 src/sequali/__main__.py                 |  6 ++++--
 src/sequali/_qcmodule.c                 | 10 ++++-----
 tests/test_overrepresented_sequences.py | 27 +++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/src/sequali/__main__.py b/src/sequali/__main__.py
index 68cab23..2dc171f 100644
--- a/src/sequali/__main__.py
+++ b/src/sequali/__main__.py
@@ -126,14 +126,16 @@ def argument_parser() -> argparse.ArgumentParser:
                         metavar="BP",
                         help=f"The minimum amount of bases sampled from the "
                              f"start of the read. There might be slight overshoot "
-                             f"depending on the fragment length. "
+                             f"depending on the fragment length. Set to a "
+                             f"negative value to sample the entire read. "
                              f"Default: {DEFAULT_BASES_FROM_START}")
     parser.add_argument("--overrepresentation-bases-from-end", type=int,
                         default=DEFAULT_BASES_FROM_END,
                         metavar="BP",
                         help=f"The minimum amount of bases sampled from the "
                              f"end of the read. There might be slight overshoot "
-                             f"depending on the fragment length. "
+                             f"depending on the fragment length. Set to a "
+                             f"negative value to sample the entire read. "
                              f"Default: {DEFAULT_BASES_FROM_END}")
     parser.add_argument("--duplication-max-stored-fingerprints", type=int,
                         default=DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS,
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index 6acc731..57e3033 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -3193,9 +3193,9 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw
                                  "sample_every",         "bases_from_start",
                                  "bases_from_end",       NULL};
     static char *format = "|nnnnn:OverrepresentedSequences";
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames,
-                                     &max_unique_fragments, &fragment_length,
-                                     &sample_every)) {
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwargs, format, kwargnames, &max_unique_fragments,
+            &fragment_length, &sample_every, &bases_from_start, &bases_from_end)) {
         return NULL;
     }
     if (max_unique_fragments < 1) {
@@ -3216,10 +3216,10 @@ OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kw
                      "sample_every must be 1 or greater. Got %zd", sample_every);
         return NULL;
     }
-    if (bases_from_start < 1) {
+    if (bases_from_start < 0) {
         bases_from_start = UINT32_MAX;
     }
-    if (bases_from_end < 1) {
+    if (bases_from_end < 0) {
         bases_from_end = UINT32_MAX;
     }
     /* If size is a power of 2, the modulo HASH_TABLE_SIZE can be optimised to
diff --git a/tests/test_overrepresented_sequences.py b/tests/test_overrepresented_sequences.py
index 7be86dc..e7391b7 100644
--- a/tests/test_overrepresented_sequences.py
+++ b/tests/test_overrepresented_sequences.py
@@ -179,3 +179,30 @@ def test_valid_does_not_warn_for_n():
         seqs.add_read(view_from_sequence("ACGTN"))
     # N does lead to a sample not being loaded.
     assert seqs.sampled_sequences == 1
+
+
+@pytest.mark.parametrize(
+    ["bases_from_start", "bases_from_end", "result"], [
+        (0, 0, ()),
+        (1, 1, ("AAC", "CAA")),
+        (2, 2, ("AAC", "CAA")),
+        (3, 3, ("AAC", "CAA")),
+        (4, 4, ("AAC", "CAA", "CCG", "GCC")),
+        (1, 0, ("AAC",)),
+        (0, 1, ("CAA",)),
+        (100, 100, ("AAA", "AAC", "CAA", "CCG", "GCC")),
+        (-1, -1, ("AAA", "AAC", "CAA", "CCG", "GCC")),
+    ]
+)
+def test_overrepresented_sequences_sample_from_begin_and_end(
+        bases_from_start, bases_from_end, result):
+    seqs = OverrepresentedSequences(
+        fragment_length=3,
+        sample_every=1,
+        bases_from_start=bases_from_start,
+        bases_from_end=bases_from_end
+    )
+    seqs.add_read(view_from_sequence("AACCGGTTTTGGCCAA"))
+    overrepresented = [x[2] for x in seqs.overrepresented_sequences(min_threshold=1)]
+    overrepresented.sort()
+    assert tuple(overrepresented) == result

From f347ae23317fea9306f043f950272cc9bf75b71d Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Wed, 20 Nov 2024 11:07:03 +0100
Subject: [PATCH 6/7] Add overrepresented sequencing sampling to the changelog

---
 CHANGELOG.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 119675b..bb5614a 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,14 @@ Changelog
 .. This document is user facing. Please word the changes in such a way
 .. that users understand how the changes affect the new version.
 
+version 0.13.0-dev
+------------------
++ Only sample the first 100 bp from the beginning and end of the read by
+  default for the overrepresented sequences analysis. This prevents a lot of
+  false positives from common human genome repeats. The amount of base pairs
+  that are sampled from the beginning and end is user settable with an option
+  to sample everything.
+
 version 0.12.0
 ------------------
 + Properly name percentiles as such in the sequence length distribution rather

From d4a06a27c40631d5aaa758968cc93aad9c6bf3d5 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Wed, 20 Nov 2024 11:25:40 +0100
Subject: [PATCH 7/7] Update documentation with changes to the overrepresented
 sequences algorithm

---
 .../images/overrepresented_sampling.fodg      | 115 ++++++++---
 .../images/overrepresented_sampling.svg       | 181 ++++++++++++++----
 docs/module_options.rst                       |  14 +-
 3 files changed, 240 insertions(+), 70 deletions(-)

diff --git a/docs/_static/images/overrepresented_sampling.fodg b/docs/_static/images/overrepresented_sampling.fodg
index 27b19a0..26205c3 100644
--- a/docs/_static/images/overrepresented_sampling.fodg
+++ b/docs/_static/images/overrepresented_sampling.fodg
@@ -1,11 +1,11 @@
 <?xml version="1.0" encoding="UTF-8"?>
 
 <office:document xmlns:anim="urn:oasis:names:tc:opendocument:xmlns:animation:1.0" xmlns:smil="urn:oasis:names:tc:opendocument:xmlns:smil-compatible:1.0" xmlns:presentation="urn:oasis:names:tc:opendocument:xmlns:presentation:1.0" xmlns:css3t="http://www.w3.org/TR/css3-text/" xmlns:grddl="http://www.w3.org/2003/g/data-view#" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xforms="http://www.w3.org/2002/xforms" xmlns:dom="http://www.w3.org/2001/xml-events" xmlns:script="urn:oasis:names:tc:opendocument:xmlns:script:1.0" xmlns:form="urn:oasis:names:tc:opendocument:xmlns:form:1.0" xmlns:math="http://www.w3.org/1998/Math/MathML" xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" xmlns:ooo="http://openoffice.org/2004/office" xmlns:fo="urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0" xmlns:config="urn:oasis:names:tc:opendocument:xmlns:config:1.0" xmlns:ooow="http://openoffice.org/2004/writer" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:drawooo="http://openoffice.org/2010/draw" xmlns:oooc="http://openoffice.org/2004/calc" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:calcext="urn:org:documentfoundation:names:experimental:calc:xmlns:calcext:1.0" xmlns:style="urn:oasis:names:tc:opendocument:xmlns:style:1.0" xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0" xmlns:of="urn:oasis:names:tc:opendocument:xmlns:of:1.2" xmlns:tableooo="http://openoffice.org/2009/table" xmlns:draw="urn:oasis:names:tc:opendocument:xmlns:drawing:1.0" xmlns:dr3d="urn:oasis:names:tc:opendocument:xmlns:dr3d:1.0" xmlns:rpt="http://openoffice.org/2005/report" xmlns:formx="urn:openoffice:names:experimental:ooxml-odf-interop:xmlns:form:1.0" xmlns:svg="urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0" xmlns:chart="urn:oasis:names:tc:opendocument:xmlns:chart:1.0" xmlns:officeooo="http://openoffice.org/2009/office" xmlns:table="urn:oasis:names:tc:opendocument:xmlns:table:1.0" xmlns:field="urn:openoffice:names:experimental:ooo-ms-interop:xmlns:field:1.0" xmlns:number="urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0" xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0" xmlns:loext="urn:org:documentfoundation:names:experimental:office:xmlns:loext:1.0" office:version="1.3" office:mimetype="application/vnd.oasis.opendocument.graphics">
- <office:meta><meta:initial-creator>Ruben Vorderman</meta:initial-creator><meta:creation-date>2024-03-27T12:13:53.658159835</meta:creation-date><meta:generator>LibreOffice/7.4.7.2$Linux_X86_64 LibreOffice_project/40$Build-2</meta:generator><dc:date>2024-03-27T12:27:29.181257342</dc:date><dc:creator>Ruben Vorderman</dc:creator><meta:editing-duration>PT13M36S</meta:editing-duration><meta:editing-cycles>3</meta:editing-cycles><meta:document-statistic meta:object-count="29"/></office:meta>
+ <office:meta><meta:initial-creator>Ruben Vorderman</meta:initial-creator><meta:creation-date>2024-03-27T12:13:53.658159835</meta:creation-date><meta:generator>LibreOffice/7.4.7.2$Linux_X86_64 LibreOffice_project/40$Build-2</meta:generator><dc:date>2024-11-20T11:17:18.804415312</dc:date><dc:creator>Ruben Vorderman</dc:creator><meta:editing-duration>PT20M2S</meta:editing-duration><meta:editing-cycles>4</meta:editing-cycles><meta:document-statistic meta:object-count="43"/></office:meta>
  <office:settings>
   <config:config-item-set config:name="ooo:view-settings">
-   <config:config-item config:name="VisibleAreaTop" config:type="int">-5024</config:config-item>
-   <config:config-item config:name="VisibleAreaLeft" config:type="int">-849</config:config-item>
+   <config:config-item config:name="VisibleAreaTop" config:type="int">-1890</config:config-item>
+   <config:config-item config:name="VisibleAreaLeft" config:type="int">-1872</config:config-item>
    <config:config-item config:name="VisibleAreaWidth" config:type="int">44763</config:config-item>
    <config:config-item config:name="VisibleAreaHeight" config:type="int">30807</config:config-item>
    <config:config-item-map-indexed config:name="Views">
@@ -35,10 +35,10 @@
      <config:config-item config:name="IsClickChangeRotation" config:type="boolean">true</config:config-item>
      <config:config-item config:name="SlidesPerRow" config:type="short">4</config:config-item>
      <config:config-item config:name="EditMode" config:type="int">0</config:config-item>
-     <config:config-item config:name="VisibleAreaTop" config:type="int">-5024</config:config-item>
-     <config:config-item config:name="VisibleAreaLeft" config:type="int">-849</config:config-item>
-     <config:config-item config:name="VisibleAreaWidth" config:type="int">31530</config:config-item>
-     <config:config-item config:name="VisibleAreaHeight" config:type="int">21700</config:config-item>
+     <config:config-item config:name="VisibleAreaTop" config:type="int">-1890</config:config-item>
+     <config:config-item config:name="VisibleAreaLeft" config:type="int">-1872</config:config-item>
+     <config:config-item config:name="VisibleAreaWidth" config:type="int">33617</config:config-item>
+     <config:config-item config:name="VisibleAreaHeight" config:type="int">21741</config:config-item>
      <config:config-item config:name="GridCoarseWidth" config:type="int">1000</config:config-item>
      <config:config-item config:name="GridCoarseHeight" config:type="int">1000</config:config-item>
      <config:config-item config:name="GridFineWidth" config:type="int">500</config:config-item>
@@ -100,8 +100,8 @@
    <config:config-item config:name="IsKernAsianPunctuation" config:type="boolean">false</config:config-item>
    <config:config-item config:name="PrinterPaperFromSetup" config:type="boolean">false</config:config-item>
    <config:config-item config:name="ParagraphSummation" config:type="boolean">false</config:config-item>
-   <config:config-item config:name="PrinterSetup" config:type="base64Binary">lAH+/0dlbmVyaWMgUHJpbnRlcgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAU0dFTlBSVAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAWAAMAtQAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9R2VuZXJpYyBQcmludGVyCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luYWRqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4dERhdGEKUGFnZVNpemU6QTQAABIAQ09NUEFUX0RVUExFWF9NT0RFDwBEdXBsZXhNb2RlOjpPZmY=</config:config-item>
-   <config:config-item config:name="PrinterName" config:type="string">Generic Printer</config:config-item>
+   <config:config-item config:name="PrinterSetup" config:type="base64Binary">owH+/0hQX0VOVllfNzY0MF9zZXJpZXNfNEMzRDAzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpIUF9FTlZZXzc2NDBfc2VyaWVzXzRDM0QwMwAWAAMAxAAAAAAAAAAIAFZUAAAkbQAASm9iRGF0YSAxCnByaW50ZXI9SFBfRU5WWV83NjQwX3Nlcmllc180QzNEMDMKb3JpZW50YXRpb249UG9ydHJhaXQKY29waWVzPTEKY29sbGF0ZT1mYWxzZQptYXJnaW5hZGp1c3RtZW50PTAsMCwwLDAKY29sb3JkZXB0aD0yNApwc2xldmVsPTAKcGRmZGV2aWNlPTEKY29sb3JkZXZpY2U9MApQUERDb250ZXh0RGF0YQpQYWdlU2l6ZTpMZXR0ZXIAABIAQ09NUEFUX0RVUExFWF9NT0RFDwBEdXBsZXhNb2RlOjpPZmY=</config:config-item>
+   <config:config-item config:name="PrinterName" config:type="string">HP_ENVY_7640_series_4C3D03</config:config-item>
    <config:config-item config:name="DefaultTabStop" config:type="int">1250</config:config-item>
   </config:config-item-set>
  </office:settings>
@@ -131,9 +131,12 @@
   <draw:hatch draw:name="Black_20_90_20_Degrees" draw:display-name="Black 90 Degrees" draw:style="single" draw:color="#000000" draw:distance="0.102cm" draw:rotation="900"/>
   <draw:hatch draw:name="Blue_20_45_20_Degrees" draw:display-name="Blue 45 Degrees" draw:style="single" draw:color="#2a6099" draw:distance="0.203cm" draw:rotation="450"/>
   <draw:marker draw:name="Arrow" svg:viewBox="0 0 20 30" svg:d="M10 0l-10 30h20z"/>
+  <draw:stroke-dash draw:name="Dot_20__28_Rounded_29_" draw:display-name="Dot (Rounded)" draw:style="round" draw:dots1="1" draw:dots1-length="201%" draw:distance="199%"/>
+  <draw:stroke-dash draw:name="Long_20_Dash" draw:display-name="Long Dash" draw:style="rect" draw:dots1="1" draw:dots1-length="400%" draw:distance="300%"/>
+  <draw:stroke-dash draw:name="Long_20_Dot" draw:display-name="Long Dot" draw:style="rect" draw:dots1="1" draw:dots1-length="100%" draw:distance="300%"/>
   <style:default-style style:family="graphic">
    <style:graphic-properties svg:stroke-color="#3465a4" draw:fill-color="#729fcf" fo:wrap-option="no-wrap"/>
-   <style:paragraph-properties style:text-autospace="ideograph-alpha" style:punctuation-wrap="simple" style:line-break="strict" style:writing-mode="lr-tb" style:font-independent-line-spacing="false">
+   <style:paragraph-properties style:text-autospace="ideograph-alpha" style:punctuation-wrap="simple" style:line-break="strict" style:font-independent-line-spacing="false">
     <style:tab-stops/>
    </style:paragraph-properties>
    <style:text-properties style:use-window-font-color="true" loext:opacity="0%" loext:color-lum-mod="100%" loext:color-lum-off="0%" style:font-name="Liberation Serif" fo:font-size="24pt" fo:language="en" fo:country="US" style:font-name-asian="DejaVu Sans" style:font-size-asian="24pt" style:language-asian="zh" style:country-asian="CN" style:font-name-complex="DejaVu Sans" style:font-size-complex="24pt" style:language-complex="hi" style:country-complex="IN"/>
@@ -291,7 +294,7 @@
   </style:style>
   <style:style style:name="dp2" style:family="drawing-page"/>
   <style:style style:name="gr1" style:family="graphic" style:parent-style-name="standard">
-   <style:graphic-properties draw:stroke="none" draw:fill="solid" draw:fill-color="#6495ed" loext:fill-use-slide-background="false" draw:textarea-horizontal-align="justify" draw:textarea-vertical-align="middle" draw:auto-grow-height="false" fo:min-height="0.75cm" fo:min-width="11cm"/>
+   <style:graphic-properties draw:stroke="none" draw:fill="solid" draw:fill-color="#6495ed" loext:fill-use-slide-background="false" draw:textarea-horizontal-align="justify" draw:textarea-vertical-align="middle" draw:auto-grow-height="false" fo:min-height="0.75cm" fo:min-width="9.5cm"/>
    <style:paragraph-properties style:writing-mode="lr-tb"/>
   </style:style>
   <style:style style:name="gr2" style:family="graphic" style:parent-style-name="standard">
@@ -312,20 +315,24 @@
    <style:graphic-properties draw:stroke="none" draw:fill="solid" draw:fill-color="#6495ed" loext:fill-use-slide-background="false" draw:textarea-horizontal-align="justify" draw:textarea-vertical-align="middle" draw:auto-grow-height="false" fo:min-height="0.75cm" fo:min-width="6.5cm"/>
    <style:paragraph-properties style:writing-mode="lr-tb"/>
   </style:style>
+  <style:style style:name="gr7" style:family="graphic" style:parent-style-name="standard">
+   <style:graphic-properties draw:stroke="none" draw:fill="solid" draw:fill-color="#6495ed" loext:fill-use-slide-background="false" draw:textarea-horizontal-align="justify" draw:textarea-vertical-align="middle" draw:auto-grow-height="false" fo:min-height="0.75cm" fo:min-width="14cm"/>
+   <style:paragraph-properties style:writing-mode="lr-tb"/>
+  </style:style>
   <style:style style:name="P1" style:family="paragraph">
-   <style:paragraph-properties fo:text-align="center"/>
+   <style:paragraph-properties fo:text-align="center" style:writing-mode="lr-tb"/>
   </style:style>
   <style:style style:name="P2" style:family="paragraph">
    <loext:graphic-properties draw:fill="solid" draw:fill-color="#6495ed"/>
-   <style:paragraph-properties fo:text-align="center"/>
+   <style:paragraph-properties fo:text-align="center" style:writing-mode="lr-tb"/>
   </style:style>
   <style:style style:name="P3" style:family="paragraph">
    <loext:graphic-properties draw:fill-color="#f0e68c"/>
-   <style:paragraph-properties fo:text-align="center"/>
+   <style:paragraph-properties fo:text-align="center" style:writing-mode="lr-tb"/>
   </style:style>
   <style:style style:name="P4" style:family="paragraph">
    <loext:graphic-properties draw:fill-color="#ff6347"/>
-   <style:paragraph-properties fo:text-align="center"/>
+   <style:paragraph-properties fo:text-align="center" style:writing-mode="lr-tb"/>
   </style:style>
   <style:style style:name="P5" style:family="paragraph">
    <loext:graphic-properties draw:fill="solid" draw:fill-color="#c0c0c0"/>
@@ -387,7 +394,7 @@
  <office:body>
   <office:drawing>
    <draw:page draw:name="page1" draw:style-name="dp2" draw:master-page-name="Standaard">
-    <draw:custom-shape draw:style-name="gr1" draw:text-style-name="P2" draw:layer="layout" svg:width="11.5cm" svg:height="1cm" svg:x="7cm" svg:y="1.5cm">
+    <draw:custom-shape draw:style-name="gr1" draw:text-style-name="P2" draw:layer="layout" svg:width="10cm" svg:height="1cm" svg:x="7cm" svg:y="1.5cm">
      <text:p text:style-name="P1">Sequence #1</text:p>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
@@ -395,7 +402,7 @@
      <text:p text:style-name="P1">barcode</text:p>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
-    <draw:custom-shape draw:style-name="gr3" draw:text-style-name="P4" draw:layer="layout" svg:width="3cm" svg:height="1cm" svg:x="18.5cm" svg:y="1.5cm">
+    <draw:custom-shape draw:style-name="gr3" draw:text-style-name="P4" draw:layer="layout" svg:width="3cm" svg:height="1cm" svg:x="17cm" svg:y="1.5cm">
      <text:p text:style-name="P1">adapter</text:p>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
@@ -423,11 +430,11 @@
      <text:p/>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
-    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="19.5cm" svg:y="4cm">
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="18cm" svg:y="4cm">
      <text:p/>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
-    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="17.5cm" svg:y="4cm">
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="16cm" svg:y="4cm">
      <text:p/>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
@@ -435,19 +442,15 @@
      <text:p/>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
-    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="15.5cm" svg:y="4cm">
-     <text:p/>
-     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
-    </draw:custom-shape>
-    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="13.5cm" svg:y="4cm">
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="14cm" svg:y="4cm">
      <text:p/>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
-    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="11.5cm" svg:y="4cm">
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="12cm" svg:y="4cm">
      <text:p/>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
-    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="11cm" svg:y="3cm">
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="10cm" svg:y="4cm">
      <text:p/>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
@@ -503,6 +506,66 @@
      <text:p/>
      <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
     </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr7" draw:text-style-name="P2" draw:layer="layout" svg:width="14.5cm" svg:height="1cm" svg:x="7cm" svg:y="9.5cm">
+     <text:p text:style-name="P1">Sequence #3</text:p>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr2" draw:text-style-name="P3" draw:layer="layout" svg:width="3cm" svg:height="1cm" svg:x="4cm" svg:y="9.5cm">
+     <text:p text:style-name="P1">barcode</text:p>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr3" draw:text-style-name="P4" draw:layer="layout" svg:width="3cm" svg:height="1cm" svg:x="21.5cm" svg:y="9.5cm">
+     <text:p text:style-name="P1">adapter</text:p>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr3" draw:text-style-name="P4" draw:layer="layout" svg:width="3cm" svg:height="1cm" svg:x="1cm" svg:y="9.5cm">
+     <text:p text:style-name="P1">adapter</text:p>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="1cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="3cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="5cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="7cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="9cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="22.5cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="20.5cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="1cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="18.5cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="16.5cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
+    <draw:custom-shape draw:style-name="gr5" draw:text-style-name="P5" draw:layer="layout" svg:width="2cm" svg:height="0.5cm" svg:x="14.5cm" svg:y="11cm">
+     <text:p/>
+     <draw:enhanced-geometry svg:viewBox="0 0 21600 21600" draw:type="rectangle" draw:enhanced-path="M 0 0 L 21600 0 21600 21600 0 21600 0 0 Z N"/>
+    </draw:custom-shape>
    </draw:page>
   </office:drawing>
  </office:body>
diff --git a/docs/_static/images/overrepresented_sampling.svg b/docs/_static/images/overrepresented_sampling.svg
index 3cf7716..45871bd 100644
--- a/docs/_static/images/overrepresented_sampling.svg
+++ b/docs/_static/images/overrepresented_sampling.svg
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
-<svg version="1.2" width="205.19mm" height="70.1mm" viewBox="991 1500 20519 7010" preserveAspectRatio="xMidYMid" fill-rule="evenodd" stroke-width="28.222" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg" xmlns:ooo="http://xml.openoffice.org/svg/export" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:presentation="http://sun.com/xmlns/staroffice/presentation" xmlns:smil="http://www.w3.org/2001/SMIL20/" xmlns:anim="urn:oasis:names:tc:opendocument:xmlns:animation:1.0" xml:space="preserve">
+<svg version="1.2" width="235.19mm" height="100.1mm" viewBox="991 1500 23519 10010" preserveAspectRatio="xMidYMid" fill-rule="evenodd" stroke-width="28.222" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg" xmlns:ooo="http://xml.openoffice.org/svg/export" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:presentation="http://sun.com/xmlns/staroffice/presentation" xmlns:smil="http://www.w3.org/2001/SMIL20/" xmlns:anim="urn:oasis:names:tc:opendocument:xmlns:animation:1.0" xml:space="preserve">
  <defs>
   <font id="EmbeddedFont_1" horiz-adv-x="2048">
    <font-face font-family="Liberation Sans embedded" units-per-em="2048" font-weight="normal" font-style="normal" ascent="1842" descent="419"/>
@@ -18,6 +18,7 @@
    <glyph unicode="b" horiz-adv-x="950" d="M 1053,546 C 1053,169 920,-20 655,-20 573,-20 505,-5 451,25 396,54 352,102 318,168 L 316,168 C 316,147 315,116 312,74 309,31 307,7 306,0 L 132,0 C 136,36 138,110 138,223 L 138,1484 318,1484 318,1061 C 318,1018 317,967 314,908 L 318,908 C 351,977 396,1027 451,1057 506,1087 574,1102 655,1102 792,1102 892,1056 957,964 1021,872 1053,733 1053,546 Z M 864,540 C 864,691 844,800 804,865 764,930 699,963 609,963 508,963 434,928 388,859 341,790 318,680 318,529 318,387 341,282 386,215 431,147 505,113 607,113 698,113 763,147 804,214 844,281 864,389 864,540 Z"/>
    <glyph unicode="a" horiz-adv-x="1061" d="M 414,-20 C 305,-20 224,9 169,66 114,123 87,202 87,302 87,414 124,500 198,560 271,620 390,652 554,656 L 797,660 797,719 C 797,807 778,870 741,908 704,946 645,965 565,965 484,965 426,951 389,924 352,897 330,853 323,793 L 135,810 C 166,1005 310,1102 569,1102 705,1102 807,1071 876,1009 945,946 979,856 979,738 L 979,272 C 979,219 986,179 1000,152 1014,125 1041,111 1080,111 1097,111 1117,113 1139,118 L 1139,6 C 1094,-5 1047,-10 1000,-10 933,-10 885,8 855,43 824,78 807,132 803,207 L 797,207 C 751,124 698,66 637,32 576,-3 501,-20 414,-20 Z M 455,115 C 521,115 580,130 631,160 682,190 723,231 753,284 782,336 797,390 797,445 L 797,534 600,530 C 515,529 451,520 408,504 364,488 330,463 307,430 284,397 272,353 272,299 272,240 288,195 320,163 351,131 396,115 455,115 Z"/>
    <glyph unicode="S" horiz-adv-x="1201" d="M 1272,389 C 1272,259 1221,158 1120,87 1018,16 875,-20 690,-20 347,-20 148,99 93,338 L 278,375 C 299,290 345,228 414,189 483,149 578,129 697,129 820,129 916,150 983,193 1050,235 1083,297 1083,379 1083,425 1073,462 1052,491 1031,520 1001,543 963,562 925,581 880,596 827,609 774,622 716,635 652,650 541,675 456,699 399,724 341,749 295,776 262,807 229,837 203,872 186,913 168,954 159,1000 159,1053 159,1174 205,1267 298,1332 390,1397 522,1430 694,1430 854,1430 976,1406 1061,1357 1146,1308 1205,1224 1239,1106 L 1051,1073 C 1030,1148 991,1202 933,1236 875,1269 795,1286 692,1286 579,1286 493,1267 434,1230 375,1193 345,1137 345,1063 345,1020 357,984 380,956 403,927 436,903 479,884 522,864 609,840 738,811 781,801 825,791 868,781 911,770 952,758 991,744 1030,729 1067,712 1102,693 1136,674 1166,650 1191,622 1216,594 1236,561 1251,523 1265,485 1272,440 1272,389 Z"/>
+   <glyph unicode="3" horiz-adv-x="1006" d="M 1049,389 C 1049,259 1008,158 925,87 842,16 724,-20 571,-20 428,-20 315,12 230,77 145,141 94,236 78,362 L 264,379 C 288,212 390,129 571,129 662,129 733,151 785,196 836,241 862,307 862,395 862,472 833,532 774,575 715,618 629,639 518,639 L 416,639 416,795 514,795 C 613,795 689,817 744,860 798,903 825,962 825,1038 825,1113 803,1173 759,1217 714,1260 648,1282 561,1282 482,1282 418,1262 369,1221 320,1180 291,1123 283,1049 L 102,1063 C 115,1178 163,1268 246,1333 328,1398 434,1430 563,1430 704,1430 814,1397 893,1332 971,1266 1010,1174 1010,1057 1010,967 985,894 935,838 884,781 811,743 715,723 L 715,719 C 820,708 902,672 961,613 1020,554 1049,479 1049,389 Z"/>
    <glyph unicode="2" horiz-adv-x="950" d="M 103,0 L 103,127 C 137,205 179,274 228,334 277,393 328,447 382,496 436,544 490,589 543,630 596,671 643,713 686,754 729,795 763,839 790,884 816,929 829,981 829,1038 829,1115 806,1175 761,1218 716,1261 653,1282 572,1282 495,1282 432,1261 383,1220 333,1178 304,1119 295,1044 L 111,1061 C 124,1174 172,1263 255,1330 337,1397 443,1430 572,1430 714,1430 823,1397 900,1330 976,1263 1014,1167 1014,1044 1014,989 1002,935 977,881 952,827 914,773 865,719 816,665 721,581 582,468 505,405 444,349 399,299 354,248 321,200 301,153 L 1036,153 1036,0 103,0 Z"/>
    <glyph unicode="1" horiz-adv-x="922" d="M 156,0 L 156,153 515,153 515,1237 197,1010 197,1180 530,1409 696,1409 696,153 1039,153 1039,0 156,0 Z"/>
    <glyph unicode="#" horiz-adv-x="1145" d="M 896,885 L 818,516 1078,516 1078,408 795,408 707,0 597,0 683,408 320,408 236,0 126,0 210,408 9,408 9,516 234,516 312,885 60,885 60,993 334,993 423,1401 533,1401 445,993 808,993 896,1401 1006,1401 918,993 1129,993 1129,885 896,885 Z M 425,885 L 345,516 707,516 785,885 425,885 Z"/>
@@ -59,9 +60,9 @@
  <g class="Page">
   <g class="com.sun.star.drawing.CustomShape">
    <g id="id3">
-    <rect class="BoundingBox" stroke="none" fill="none" x="7000" y="1500" width="11501" height="1001"/>
-    <path fill="rgb(100,149,237)" stroke="none" d="M 12750,2500 L 7000,2500 7000,1500 18500,1500 18500,2500 12750,2500 Z"/>
-    <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="10885" y="2221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">Sequence #1</tspan></tspan></tspan></text>
+    <rect class="BoundingBox" stroke="none" fill="none" x="7000" y="1500" width="10001" height="1001"/>
+    <path fill="rgb(100,149,237)" stroke="none" d="M 12000,2500 L 7000,2500 7000,1500 17000,1500 17000,2500 12000,2500 Z"/>
+    <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="10135" y="2221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">Sequence #1</tspan></tspan></tspan></text>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
@@ -73,9 +74,9 @@
   </g>
   <g class="com.sun.star.drawing.CustomShape">
    <g id="id5">
-    <rect class="BoundingBox" stroke="none" fill="none" x="18500" y="1500" width="3001" height="1001"/>
-    <path fill="rgb(255,99,71)" stroke="none" d="M 20000,2500 L 18500,2500 18500,1500 21500,1500 21500,2500 20000,2500 Z"/>
-    <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="18927" y="2221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">adapter</tspan></tspan></tspan></text>
+    <rect class="BoundingBox" stroke="none" fill="none" x="17000" y="1500" width="3001" height="1001"/>
+    <path fill="rgb(255,99,71)" stroke="none" d="M 18500,2500 L 17000,2500 17000,1500 20000,1500 20000,2500 18500,2500 Z"/>
+    <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="17427" y="2221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">adapter</tspan></tspan></tspan></text>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
@@ -122,16 +123,16 @@
   </g>
   <g class="com.sun.star.drawing.CustomShape">
    <g id="id12">
-    <rect class="BoundingBox" stroke="none" fill="none" x="19491" y="3991" width="2019" height="519"/>
-    <path fill="rgb(192,192,192)" stroke="none" d="M 20500,4500 L 19500,4500 19500,4000 21500,4000 21500,4500 20500,4500 Z"/>
-    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 20500,4500 L 19500,4500 19500,4000 21500,4000 21500,4500 20500,4500 Z"/>
+    <rect class="BoundingBox" stroke="none" fill="none" x="17991" y="3991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 19000,4500 L 18000,4500 18000,4000 20000,4000 20000,4500 19000,4500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 19000,4500 L 18000,4500 18000,4000 20000,4000 20000,4500 19000,4500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
    <g id="id13">
-    <rect class="BoundingBox" stroke="none" fill="none" x="17491" y="3991" width="2019" height="519"/>
-    <path fill="rgb(192,192,192)" stroke="none" d="M 18500,4500 L 17500,4500 17500,4000 19500,4000 19500,4500 18500,4500 Z"/>
-    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 18500,4500 L 17500,4500 17500,4000 19500,4000 19500,4500 18500,4500 Z"/>
+    <rect class="BoundingBox" stroke="none" fill="none" x="15991" y="3991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 17000,4500 L 16000,4500 16000,4000 18000,4000 18000,4500 17000,4500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 17000,4500 L 16000,4500 16000,4000 18000,4000 18000,4500 17000,4500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
@@ -143,122 +144,220 @@
   </g>
   <g class="com.sun.star.drawing.CustomShape">
    <g id="id15">
-    <rect class="BoundingBox" stroke="none" fill="none" x="15491" y="3991" width="2019" height="519"/>
-    <path fill="rgb(192,192,192)" stroke="none" d="M 16500,4500 L 15500,4500 15500,4000 17500,4000 17500,4500 16500,4500 Z"/>
-    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 16500,4500 L 15500,4500 15500,4000 17500,4000 17500,4500 16500,4500 Z"/>
+    <rect class="BoundingBox" stroke="none" fill="none" x="13991" y="3991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 15000,4500 L 14000,4500 14000,4000 16000,4000 16000,4500 15000,4500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 15000,4500 L 14000,4500 14000,4000 16000,4000 16000,4500 15000,4500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
    <g id="id16">
-    <rect class="BoundingBox" stroke="none" fill="none" x="13491" y="3991" width="2019" height="519"/>
-    <path fill="rgb(192,192,192)" stroke="none" d="M 14500,4500 L 13500,4500 13500,4000 15500,4000 15500,4500 14500,4500 Z"/>
-    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 14500,4500 L 13500,4500 13500,4000 15500,4000 15500,4500 14500,4500 Z"/>
+    <rect class="BoundingBox" stroke="none" fill="none" x="11991" y="3991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 13000,4500 L 12000,4500 12000,4000 14000,4000 14000,4500 13000,4500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 13000,4500 L 12000,4500 12000,4000 14000,4000 14000,4500 13000,4500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
    <g id="id17">
-    <rect class="BoundingBox" stroke="none" fill="none" x="11491" y="3991" width="2019" height="519"/>
-    <path fill="rgb(192,192,192)" stroke="none" d="M 12500,4500 L 11500,4500 11500,4000 13500,4000 13500,4500 12500,4500 Z"/>
-    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 12500,4500 L 11500,4500 11500,4000 13500,4000 13500,4500 12500,4500 Z"/>
+    <rect class="BoundingBox" stroke="none" fill="none" x="9991" y="3991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 11000,4500 L 10000,4500 10000,4000 12000,4000 12000,4500 11000,4500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 11000,4500 L 10000,4500 10000,4000 12000,4000 12000,4500 11000,4500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
    <g id="id18">
-    <rect class="BoundingBox" stroke="none" fill="none" x="10991" y="2991" width="2019" height="519"/>
-    <path fill="rgb(192,192,192)" stroke="none" d="M 12000,3500 L 11000,3500 11000,3000 13000,3000 13000,3500 12000,3500 Z"/>
-    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 12000,3500 L 11000,3500 11000,3000 13000,3000 13000,3500 12000,3500 Z"/>
-   </g>
-  </g>
-  <g class="com.sun.star.drawing.CustomShape">
-   <g id="id19">
     <rect class="BoundingBox" stroke="none" fill="none" x="7000" y="5500" width="7001" height="1001"/>
     <path fill="rgb(100,149,237)" stroke="none" d="M 10500,6500 L 7000,6500 7000,5500 14000,5500 14000,6500 10500,6500 Z"/>
     <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="8635" y="6221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">Sequence #2</tspan></tspan></tspan></text>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id20">
+   <g id="id19">
     <rect class="BoundingBox" stroke="none" fill="none" x="4000" y="5500" width="3001" height="1001"/>
     <path fill="rgb(240,230,140)" stroke="none" d="M 5500,6500 L 4000,6500 4000,5500 7000,5500 7000,6500 5500,6500 Z"/>
     <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="4357" y="6221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">barcode</tspan></tspan></tspan></text>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id21">
+   <g id="id20">
     <rect class="BoundingBox" stroke="none" fill="none" x="14000" y="5500" width="3001" height="1001"/>
     <path fill="rgb(255,99,71)" stroke="none" d="M 15500,6500 L 14000,6500 14000,5500 17000,5500 17000,6500 15500,6500 Z"/>
     <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="14427" y="6221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">adapter</tspan></tspan></tspan></text>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id22">
+   <g id="id21">
     <rect class="BoundingBox" stroke="none" fill="none" x="1000" y="5500" width="3001" height="1001"/>
     <path fill="rgb(255,99,71)" stroke="none" d="M 2500,6500 L 1000,6500 1000,5500 4000,5500 4000,6500 2500,6500 Z"/>
     <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="1427" y="6221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">adapter</tspan></tspan></tspan></text>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id23">
+   <g id="id22">
     <rect class="BoundingBox" stroke="none" fill="none" x="991" y="6991" width="2019" height="519"/>
     <path fill="rgb(192,192,192)" stroke="none" d="M 2000,7500 L 1000,7500 1000,7000 3000,7000 3000,7500 2000,7500 Z"/>
     <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 2000,7500 L 1000,7500 1000,7000 3000,7000 3000,7500 2000,7500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id24">
+   <g id="id23">
     <rect class="BoundingBox" stroke="none" fill="none" x="2991" y="6991" width="2019" height="519"/>
     <path fill="rgb(192,192,192)" stroke="none" d="M 4000,7500 L 3000,7500 3000,7000 5000,7000 5000,7500 4000,7500 Z"/>
     <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 4000,7500 L 3000,7500 3000,7000 5000,7000 5000,7500 4000,7500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id25">
+   <g id="id24">
     <rect class="BoundingBox" stroke="none" fill="none" x="4991" y="6991" width="2019" height="519"/>
     <path fill="rgb(192,192,192)" stroke="none" d="M 6000,7500 L 5000,7500 5000,7000 7000,7000 7000,7500 6000,7500 Z"/>
     <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 6000,7500 L 5000,7500 5000,7000 7000,7000 7000,7500 6000,7500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id26">
+   <g id="id25">
     <rect class="BoundingBox" stroke="none" fill="none" x="6991" y="6991" width="2019" height="519"/>
     <path fill="rgb(192,192,192)" stroke="none" d="M 8000,7500 L 7000,7500 7000,7000 9000,7000 9000,7500 8000,7500 Z"/>
     <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 8000,7500 L 7000,7500 7000,7000 9000,7000 9000,7500 8000,7500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id27">
+   <g id="id26">
     <rect class="BoundingBox" stroke="none" fill="none" x="8991" y="7991" width="2019" height="519"/>
     <path fill="rgb(192,192,192)" stroke="none" d="M 10000,8500 L 9000,8500 9000,8000 11000,8000 11000,8500 10000,8500 Z"/>
     <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 10000,8500 L 9000,8500 9000,8000 11000,8000 11000,8500 10000,8500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id28">
+   <g id="id27">
     <rect class="BoundingBox" stroke="none" fill="none" x="991" y="6991" width="2019" height="519"/>
     <path fill="rgb(192,192,192)" stroke="none" d="M 2000,7500 L 1000,7500 1000,7000 3000,7000 3000,7500 2000,7500 Z"/>
     <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 2000,7500 L 1000,7500 1000,7000 3000,7000 3000,7500 2000,7500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id29">
+   <g id="id28">
     <rect class="BoundingBox" stroke="none" fill="none" x="12991" y="7991" width="2019" height="519"/>
     <path fill="rgb(192,192,192)" stroke="none" d="M 14000,8500 L 13000,8500 13000,8000 15000,8000 15000,8500 14000,8500 Z"/>
     <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 14000,8500 L 13000,8500 13000,8000 15000,8000 15000,8500 14000,8500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id30">
+   <g id="id29">
     <rect class="BoundingBox" stroke="none" fill="none" x="10991" y="7991" width="2019" height="519"/>
     <path fill="rgb(192,192,192)" stroke="none" d="M 12000,8500 L 11000,8500 11000,8000 13000,8000 13000,8500 12000,8500 Z"/>
     <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 12000,8500 L 11000,8500 11000,8000 13000,8000 13000,8500 12000,8500 Z"/>
    </g>
   </g>
   <g class="com.sun.star.drawing.CustomShape">
-   <g id="id31">
+   <g id="id30">
     <rect class="BoundingBox" stroke="none" fill="none" x="14991" y="7991" width="2019" height="519"/>
     <path fill="rgb(192,192,192)" stroke="none" d="M 16000,8500 L 15000,8500 15000,8000 17000,8000 17000,8500 16000,8500 Z"/>
     <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 16000,8500 L 15000,8500 15000,8000 17000,8000 17000,8500 16000,8500 Z"/>
    </g>
   </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id31">
+    <rect class="BoundingBox" stroke="none" fill="none" x="7000" y="9500" width="14501" height="1001"/>
+    <path fill="rgb(100,149,237)" stroke="none" d="M 14250,10500 L 7000,10500 7000,9500 21500,9500 21500,10500 14250,10500 Z"/>
+    <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="12385" y="10221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">Sequence #3</tspan></tspan></tspan></text>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id32">
+    <rect class="BoundingBox" stroke="none" fill="none" x="4000" y="9500" width="3001" height="1001"/>
+    <path fill="rgb(240,230,140)" stroke="none" d="M 5500,10500 L 4000,10500 4000,9500 7000,9500 7000,10500 5500,10500 Z"/>
+    <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="4357" y="10221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">barcode</tspan></tspan></tspan></text>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id33">
+    <rect class="BoundingBox" stroke="none" fill="none" x="21500" y="9500" width="3001" height="1001"/>
+    <path fill="rgb(255,99,71)" stroke="none" d="M 23000,10500 L 21500,10500 21500,9500 24500,9500 24500,10500 23000,10500 Z"/>
+    <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="21927" y="10221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">adapter</tspan></tspan></tspan></text>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id34">
+    <rect class="BoundingBox" stroke="none" fill="none" x="1000" y="9500" width="3001" height="1001"/>
+    <path fill="rgb(255,99,71)" stroke="none" d="M 2500,10500 L 1000,10500 1000,9500 4000,9500 4000,10500 2500,10500 Z"/>
+    <text class="SVGTextShape"><tspan class="TextParagraph" font-family="Liberation Sans, sans-serif" font-size="635px" font-weight="400"><tspan class="TextPosition" x="1427" y="10221"><tspan fill="rgb(0,0,0)" stroke="none" style="white-space: pre">adapter</tspan></tspan></tspan></text>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id35">
+    <rect class="BoundingBox" stroke="none" fill="none" x="991" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 2000,11500 L 1000,11500 1000,11000 3000,11000 3000,11500 2000,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 2000,11500 L 1000,11500 1000,11000 3000,11000 3000,11500 2000,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id36">
+    <rect class="BoundingBox" stroke="none" fill="none" x="2991" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 4000,11500 L 3000,11500 3000,11000 5000,11000 5000,11500 4000,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 4000,11500 L 3000,11500 3000,11000 5000,11000 5000,11500 4000,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id37">
+    <rect class="BoundingBox" stroke="none" fill="none" x="4991" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 6000,11500 L 5000,11500 5000,11000 7000,11000 7000,11500 6000,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 6000,11500 L 5000,11500 5000,11000 7000,11000 7000,11500 6000,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id38">
+    <rect class="BoundingBox" stroke="none" fill="none" x="6991" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 8000,11500 L 7000,11500 7000,11000 9000,11000 9000,11500 8000,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 8000,11500 L 7000,11500 7000,11000 9000,11000 9000,11500 8000,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id39">
+    <rect class="BoundingBox" stroke="none" fill="none" x="8991" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 10000,11500 L 9000,11500 9000,11000 11000,11000 11000,11500 10000,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 10000,11500 L 9000,11500 9000,11000 11000,11000 11000,11500 10000,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id40">
+    <rect class="BoundingBox" stroke="none" fill="none" x="22491" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 23500,11500 L 22500,11500 22500,11000 24500,11000 24500,11500 23500,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 23500,11500 L 22500,11500 22500,11000 24500,11000 24500,11500 23500,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id41">
+    <rect class="BoundingBox" stroke="none" fill="none" x="20491" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 21500,11500 L 20500,11500 20500,11000 22500,11000 22500,11500 21500,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 21500,11500 L 20500,11500 20500,11000 22500,11000 22500,11500 21500,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id42">
+    <rect class="BoundingBox" stroke="none" fill="none" x="991" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 2000,11500 L 1000,11500 1000,11000 3000,11000 3000,11500 2000,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 2000,11500 L 1000,11500 1000,11000 3000,11000 3000,11500 2000,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id43">
+    <rect class="BoundingBox" stroke="none" fill="none" x="18491" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 19500,11500 L 18500,11500 18500,11000 20500,11000 20500,11500 19500,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 19500,11500 L 18500,11500 18500,11000 20500,11000 20500,11500 19500,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id44">
+    <rect class="BoundingBox" stroke="none" fill="none" x="16491" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 17500,11500 L 16500,11500 16500,11000 18500,11000 18500,11500 17500,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 17500,11500 L 16500,11500 16500,11000 18500,11000 18500,11500 17500,11500 Z"/>
+   </g>
+  </g>
+  <g class="com.sun.star.drawing.CustomShape">
+   <g id="id45">
+    <rect class="BoundingBox" stroke="none" fill="none" x="14491" y="10991" width="2019" height="519"/>
+    <path fill="rgb(192,192,192)" stroke="none" d="M 15500,11500 L 14500,11500 14500,11000 16500,11000 16500,11500 15500,11500 Z"/>
+    <path fill="none" stroke="rgb(0,0,0)" stroke-width="18" stroke-linejoin="round" d="M 15500,11500 L 14500,11500 14500,11000 16500,11000 16500,11500 15500,11500 Z"/>
+   </g>
+  </g>
  </g>
 </svg>
\ No newline at end of file
diff --git a/docs/module_options.rst b/docs/module_options.rst
index a4d2cc9..4fd53e7 100644
--- a/docs/module_options.rst
+++ b/docs/module_options.rst
@@ -53,6 +53,10 @@ ends towards the middle. This means that the first and last fragment will
 always be the first 21 bp of the beginning and the last 21 bp in the end. As
 such the adapter sequences will always be sampled in the same frame.
 
+To prevent many common genome repeats from ending up in the report, Sequali
+limits the amount of sampling inwards to approximately 100 bp. With the default
+fragment size of 21 bp, this means that 5 fragments are taken from each side.
+
 .. figure:: _static/images/overrepresented_sampling.svg
 
     This figure shows how fragments are sampled in sequali. The silver elements
@@ -60,9 +64,9 @@ such the adapter sequences will always be sampled in the same frame.
     sampled. Despite the length differences between sequence #1 and sequence #2
     the fragments for the adapter and barcode sequences are the same.
     In Sequence #1 the fragments sampled from the back end overlap somewhat
-    with sequences from the front end. This is a necessity to ensure all of the
-    sequence is sampled when the length is not divisible by the size of the
-    fragments.
+    with sequences from the front end. In sequence #3 the sequence is quite
+    long so the middle is not sampled. This prevents many common genome repeats
+    from ending up in the report.
 
 For each sequence only unique fragments within the sequence are sampled, to
 avoid overrepresenting common genomic repeats.
@@ -89,6 +93,10 @@ The following command line parameters affect this module:
   store.
 + ``--overrepresentation-sample-every``: How often a sequence is sampled. Default
   is every 8 sequences.
++ ``--overrepresentation-bases-from-start``: the minimum amount of bases to
+  take from the start of the sequence. Default is 100.
++ ``--overrepresentation-bases-from-end``: the minimum amount of bases to
+  take from the end of the sequence. Default is 100.
 
 .. [#F1] A canonical k-mer is the k-mer that has the lowest sort order compared
          to itself and its reverse complement. This way the canonical-kmer is