From 4becefff40786e431ec4815b62200ceb79669796 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Mon, 2 Dec 2024 20:10:09 +0100
Subject: [PATCH 1/3] Refactor tags processing

---
 src/sequali/_qc.pyi        |   4 +-
 src/sequali/_qcmodule.c    | 267 ++++++++++++++++++++-----------------
 tests/test_bam_parser.py   |  13 ++
 tests/test_fastq_parser.py |   3 +
 tests/test_nano_stats.py   |  48 ++++++-
 5 files changed, 210 insertions(+), 125 deletions(-)

diff --git a/src/sequali/_qc.pyi b/src/sequali/_qc.pyi
index 77fdaf8..fd8b02c 100644
--- a/src/sequali/_qc.pyi
+++ b/src/sequali/_qc.pyi
@@ -43,10 +43,12 @@ INSERT_SIZE_MAX_ADAPTER_STORE_SIZE: int
 
 class FastqRecordView:
     obj: bytes
-    def __init__(self, __name, __sequence, __qualities) -> None: ...
+    def __init__(self, name: str, sequence: str, qualities: str, 
+                 tags: Optional[bytes] = None) -> None: ...
     def name(self) -> str: ...
     def sequence(self) -> str: ...
     def qualities(self) -> str: ...
+    def tags(self) -> bytes: ...
 
 class FastqRecordArrayView:
     obj: bytes
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index cc7c906..fd2109f 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -345,13 +345,11 @@ struct FastqMeta {
         uint32_t qualities_length;
     };
     uint32_t qualities_offset;
+    uint32_t tags_offset;
+    uint32_t tags_length;
     /* Store the accumulated error once calculated so it can be reused by
        the NanoStats module */
     double accumulated_error_rate;
-    // Nanopore specific metadata
-    time_t start_time;
-    float duration;
-    int32_t channel;
 };
 
 typedef struct _FastqRecordViewStruct {
@@ -375,10 +373,11 @@ FastqRecordView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
     PyObject *name_obj = NULL;
     PyObject *sequence_obj = NULL;
     PyObject *qualities_obj = NULL;
-    static char *kwargnames[] = {"name", "sequence", "qualities", NULL};
-    static char *format = "UUU|:FastqRecordView";
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames,
-                                     &name_obj, &sequence_obj, &qualities_obj)) {
+    PyObject *tags_obj = NULL;
+    static char *kwargnames[] = {"name", "sequence", "qualities", "tags", NULL};
+    static char *format = "UUU|S:FastqRecordView";
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames, &name_obj,
+                                     &sequence_obj, &qualities_obj, &tags_obj)) {
         return NULL;
     }
     // Unicode check already performed by the parser, so error checking can be
@@ -422,8 +421,14 @@ FastqRecordView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
             sequence_length, qualities_length);
         return NULL;
     }
-
-    size_t total_length = name_length + sequence_length + qualities_length + 6;
+    Py_ssize_t tags_length = 0;
+    char *tags = NULL;
+    if (tags_obj != NULL) {
+        tags_length = PyBytes_Size(tags_obj);
+        tags = PyBytes_AsString(tags_obj);
+    }
+    size_t total_length =
+        name_length + sequence_length + qualities_length + 6 + tags_length;
     if (total_length > UINT32_MAX) {
         // lengths are saved as uint32_t types so throw an error;
         PyErr_Format(
@@ -460,10 +465,9 @@ FastqRecordView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
     self->meta.sequence_offset = 2 + name_length;
     self->meta.sequence_length = sequence_length;
     self->meta.qualities_offset = 5 + name_length + sequence_length;
+    self->meta.tags_offset = name_length + sequence_length * 2 + 6;
+    self->meta.tags_length = tags_length;
     self->meta.accumulated_error_rate = accumulated_error_rate;
-    self->meta.duration = 0.0;
-    self->meta.start_time = 0;
-    self->meta.channel = -1;
     self->obj = bytes_obj;
 
     buffer[0] = '@';
@@ -482,6 +486,8 @@ FastqRecordView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
     memcpy(buffer + cursor, qualities, sequence_length);
     cursor += sequence_length;
     buffer[cursor] = '\n';
+    cursor += 1;
+    memcpy(buffer + cursor, tags, tags_length);
     return (PyObject *)self;
 }
 
@@ -526,6 +532,19 @@ FastqRecordView_qualities(FastqRecordView *self, PyObject *Py_UNUSED(ignore))
         self->meta.sequence_length, NULL);
 }
 
+PyDoc_STRVAR(FastqRecordView_tags__doc__,
+             "tags($self)\n"
+             "--\n"
+             "\n"
+             "Returns the raw tags as a bytes object.\n");
+static PyObject *
+FastqRecordView_tags(FastqRecordView *self, PyObject *Py_UNUSED(ignore))
+{
+    return PyBytes_FromStringAndSize(
+        (char *)self->meta.record_start + self->meta.tags_offset,
+        self->meta.tags_length);
+}
+
 static PyMethodDef FastqRecordView_methods[] = {
     {"name", (PyCFunction)FastqRecordView_name, METH_NOARGS,
      FastqRecordView_name__doc__},
@@ -533,6 +552,8 @@ static PyMethodDef FastqRecordView_methods[] = {
      FastqRecordView_sequence__doc__},
     {"qualities", (PyCFunction)FastqRecordView_qualities, METH_NOARGS,
      FastqRecordView_qualities__doc__},
+    {"tags", (PyCFunction)FastqRecordView_tags, METH_NOARGS,
+     FastqRecordView_tags__doc__},
     {NULL}};
 
 static PyMemberDef FastqRecordView_members[] = {
@@ -635,8 +656,9 @@ FastqRecordArrayView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs
             return NULL;
         }
         FastqRecordView *record = (FastqRecordView *)item;
-        size_t memory_size =
-            6 + record->meta.name_length + record->meta.sequence_length * 2;
+        size_t memory_size = 6 + record->meta.name_length +
+                             record->meta.sequence_length * 2 +
+                             record->meta.tags_length;
         total_memory_size += memory_size;
         Py_DECREF(item);
     }
@@ -677,6 +699,8 @@ FastqRecordArrayView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs
         record_ptr += meta.sequence_length;
         record_ptr[0] = '\n';
         record_ptr += 1;
+        memcpy(record_ptr, meta.record_start + meta.tags_offset, meta.tags_length);
+        record_ptr += meta.tags_length;
         memcpy(metas + i, &record->meta, sizeof(struct FastqMeta));
         Py_DECREF(record);
     }
@@ -1123,10 +1147,9 @@ FastqParser_create_record_array(FastqParser *self, size_t min_records,
             meta->sequence_offset = sequence_start - record_start;
             meta->sequence_length = sequence_length;
             meta->qualities_offset = qualities_start - record_start;
+            meta->tags_offset = qualities_end + 1 - record_start;
+            meta->tags_length = 0;
             meta->accumulated_error_rate = 0.0;
-            meta->channel = -1;
-            meta->duration = 0.0;
-            meta->start_time = 0;
             record_start = qualities_end + 1;
         }
     }
@@ -1373,101 +1396,6 @@ struct BamRecordHeader {
     int32_t tlen;
 };
 
-static int
-bam_tags_to_fastq_meta(const uint8_t *tags, size_t tags_length,
-                       struct FastqMeta *meta)
-{
-    meta->channel = -1;
-    meta->duration = 0.0;
-    meta->start_time = 0;
-    while (tags_length > 0) {
-        if (tags_length < 4) {
-            PyErr_SetString(PyExc_ValueError, "truncated tags");
-            return -1;
-        }
-        const uint8_t *tag_id = tags;
-        uint8_t tag_type = tags[2];
-        bool is_array = false;
-        const uint8_t *value_start = tags + 3;
-        uint32_t array_length = 1;
-        if (tag_type == 'B') {
-            is_array = true;
-            value_start = tags + 8;
-            tag_type = tags[3];
-            if (tags_length < 8) {
-                PyErr_SetString(PyExc_ValueError, "truncated tags");
-                return -1;
-            }
-            array_length = *(uint32_t *)(tags + 4);
-        };
-        size_t value_length;
-        switch (tag_type) {
-            case 'A':
-                value_length = 1;
-                break;
-            case 'c':
-            case 'C':
-                /* A very annoying habit of htslib to store a tag in the
-                   smallest possible size rather than being consistent. */
-                value_length = 1;
-                if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 4) {
-                    meta->channel = *(uint8_t *)(value_start);
-                }
-                break;
-            case 's':
-            case 'S':
-                value_length = 2;
-                if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 5) {
-                    meta->channel = *(uint16_t *)(value_start);
-                }
-                break;
-            case 'I':
-            case 'i':
-                if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 7) {
-                    meta->channel = *(uint32_t *)(value_start);
-                }
-                value_length = 4;
-                break;
-            case 'f':
-                if (memcmp(tag_id, "du", 2) == 0 && tags_length >= 7) {
-                    meta->duration = *(float *)(value_start);
-                }
-                value_length = 4;
-                break;
-            case 'Z':
-            case 'H':
-                if (is_array) {
-                    PyErr_Format(PyExc_ValueError, "Invalid type for array %c",
-                                 tag_type);
-                    return -1;
-                }
-                uint8_t *string_end = memchr(value_start, 0, tags_length - 3);
-                if (string_end == NULL) {
-                    PyErr_SetString(PyExc_ValueError, "truncated tags");
-                    return -1;
-                }
-                value_length =
-                    (string_end - value_start) + 1;  // +1 for terminating null
-                if (memcmp(tag_id, "st", 2) == 0) {
-                    meta->start_time = time_string_to_timestamp(value_start);
-                }
-                break;
-            default:
-                PyErr_Format(PyExc_ValueError, "Unknown tag type %c", tag_type);
-                return -1;
-        }
-        size_t this_tag_length =
-            (value_start - tags) + array_length * value_length;
-        if (this_tag_length > tags_length) {
-            PyErr_SetString(PyExc_ValueError, "truncated tags");
-            return -1;
-        }
-        tags = tags + this_tag_length;
-        tags_length -= this_tag_length;
-    }
-    return 0;
-}
-
 static PyObject *
 BamParser__next__(BamParser *self)
 {
@@ -1629,7 +1557,8 @@ BamParser__next__(BamParser *self)
             fastq_buffer_cursor += seq_length;
             fastq_buffer_cursor[0] = '\n';
             fastq_buffer_cursor += 1;
-
+            memcpy(fastq_buffer_cursor, tag_start, tags_length);
+            fastq_buffer_cursor += tags_length;
             parsed_records += 1;
             if (parsed_records > self->meta_buffer_size) {
                 struct FastqMeta *tmp = PyMem_Realloc(
@@ -1644,15 +1573,15 @@ BamParser__next__(BamParser *self)
             uint32_t sequence_offset = 1 + name_length + 1;  // For '@' and '\n'
             uint32_t qualities_offset =
                 sequence_offset + seq_length + 3;  // for '\n+\n'
+            uint32_t tags_offset = qualities_offset + seq_length + 1;
             meta->record_start = fastq_buffer_record_start;
             meta->name_length = name_length;
             meta->sequence_offset = sequence_offset;
             meta->sequence_length = seq_length;
             meta->qualities_offset = qualities_offset;
+            meta->tags_offset = tags_offset;
+            meta->tags_length = tags_length;
             meta->accumulated_error_rate = 0.0;
-            if (bam_tags_to_fastq_meta(tag_start, tags_length, meta) < 0) {
-                return NULL;
-            }
             record_start = record_end;
             fastq_buffer_record_start = fastq_buffer_cursor;
         }
@@ -4773,6 +4702,100 @@ NanoInfo_from_header(const uint8_t *header, size_t header_length,
     return 0;
 }
 
+static int
+NanoInfo_from_tags(const uint8_t *tags, size_t tags_length, struct NanoInfo *info)
+{
+    info->channel_id = -1;
+    info->duration = 0.0;
+    info->start_time = 0;
+    while (tags_length > 0) {
+        if (tags_length < 4) {
+            PyErr_SetString(PyExc_ValueError, "truncated tags");
+            return -1;
+        }
+        const uint8_t *tag_id = tags;
+        uint8_t tag_type = tags[2];
+        bool is_array = false;
+        const uint8_t *value_start = tags + 3;
+        uint32_t array_length = 1;
+        if (tag_type == 'B') {
+            is_array = true;
+            value_start = tags + 8;
+            tag_type = tags[3];
+            if (tags_length < 8) {
+                PyErr_SetString(PyExc_ValueError, "truncated tags");
+                return -1;
+            }
+            array_length = *(uint32_t *)(tags + 4);
+        };
+        size_t value_length;
+        switch (tag_type) {
+            case 'A':
+                value_length = 1;
+                break;
+            case 'c':
+            case 'C':
+                /* A very annoying habit of htslib to store a tag in the
+                   smallest possible size rather than being consistent. */
+                value_length = 1;
+                if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 4) {
+                    info->channel_id = *(uint8_t *)(value_start);
+                }
+                break;
+            case 's':
+            case 'S':
+                value_length = 2;
+                if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 5) {
+                    info->channel_id = *(uint16_t *)(value_start);
+                }
+                break;
+            case 'I':
+            case 'i':
+                if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 7) {
+                    info->channel_id = *(uint32_t *)(value_start);
+                }
+                value_length = 4;
+                break;
+            case 'f':
+                if (memcmp(tag_id, "du", 2) == 0 && tags_length >= 7) {
+                    info->duration = *(float *)(value_start);
+                }
+                value_length = 4;
+                break;
+            case 'Z':
+            case 'H':
+                if (is_array) {
+                    PyErr_Format(PyExc_ValueError, "Invalid type for array %c",
+                                 tag_type);
+                    return -1;
+                }
+                uint8_t *string_end = memchr(value_start, 0, tags_length - 3);
+                if (string_end == NULL) {
+                    PyErr_SetString(PyExc_ValueError, "truncated tags");
+                    return -1;
+                }
+                value_length =
+                    (string_end - value_start) + 1;  // +1 for terminating null
+                if (memcmp(tag_id, "st", 2) == 0) {
+                    info->start_time = time_string_to_timestamp(value_start);
+                }
+                break;
+            default:
+                PyErr_Format(PyExc_ValueError, "Unknown tag type %c", tag_type);
+                return -1;
+        }
+        size_t this_tag_length =
+            (value_start - tags) + array_length * value_length;
+        if (this_tag_length > tags_length) {
+            PyErr_SetString(PyExc_ValueError, "truncated tags");
+            return -1;
+        }
+        tags = tags + this_tag_length;
+        tags_length -= this_tag_length;
+    }
+    return 0;
+}
+
 /**
  * @brief Add a FASTQ record to the NanoStats module
  *
@@ -4803,11 +4826,11 @@ NanoStats_add_meta(NanoStats *self, struct FastqMeta *meta)
     size_t sequence_length = meta->sequence_length;
     info->length = sequence_length;
 
-    if (meta->channel != -1) {
-        /* Already parsed from BAM */
-        info->channel_id = meta->channel;
-        info->duration = meta->duration;
-        info->start_time = meta->start_time;
+    if (meta->tags_length) {
+        if (NanoInfo_from_tags(meta->record_start + meta->tags_offset,
+                               meta->tags_length, info) != 0) {
+            return -1;
+        }
     }
     else if (NanoInfo_from_header(meta->record_start + 1, meta->name_length,
                                   info) != 0) {
diff --git a/tests/test_bam_parser.py b/tests/test_bam_parser.py
index c94380d..d85be65 100644
--- a/tests/test_bam_parser.py
+++ b/tests/test_bam_parser.py
@@ -41,12 +41,15 @@ def test_bam_parser():
     assert records[0].name() == "Myheader"
     assert records[0].sequence() == "GATTACA"
     assert records[0].qualities() == "HHHHHHH"
+    assert records[0].tags() == b"RGZA\x00"
     assert records[1].name() == "AnotherHeader"
     assert records[1].sequence() == "ACATTAG"
     assert records[1].qualities() == "KKKKKKK"
+    assert records[1].tags() == b"RGZA\x00"
     assert records[2].name() == "YetAnotherHeader"
     assert records[2].sequence() == "AAAATTTT"
     assert records[2].qualities() == "XKLLCCCC"
+    assert records[2].tags() == b"RGZA\x00"
     assert len(records) == 3
 
 
@@ -64,6 +67,9 @@ def test_bam_parser_sequences_with_extended_header():
     assert (records[0].qualities() ==
             "CCCFFFFFHFFHHJIJJIJGGJJJJJJJJJJJJJIGHIIEHIJJJJJJIJJJJIBGGIIIHIIII"
             "HHHHDD;9CCDEDDDDDDDDDDEDDDDDDDDDDDDD")
+    assert (records[0].tags() ==
+            b"X0C\x01X1C\x00MDZ72G28\x00PGZMarkDuplicates\x00RGZ1\x00XGC\x00"
+            b"AMC%NMC\x01SMC%XMC\x01XOC\x00XTAU")
     assert records[1].name() == "HWI-D00119:50:H7AP8ADXX:1:2104:18479:82511"
     assert (records[1].sequence() ==
             "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT"
@@ -71,6 +77,9 @@ def test_bam_parser_sequences_with_extended_header():
     assert (records[1].qualities() ==
             "CCCFFFFFHFFHHJJJJIJJJJIIJJJJJGIJJJJGIJJJJJJJJGJIJJIJJJGHIJJJJJJJI"
             "HHHHDD@>CDDEDDDDDDDDDDEDDCDDDDD?BBD9")
+    assert (records[1].tags() ==
+            b"X0C\x01X1C\x00MDZ72G28\x00PGZMarkDuplicates\x00RGZ1\x00XGC\x00"
+            b"AMC%NMC\x01SMC%XMC\x01XOC\x00XTAU")
     assert records[2].name() == "HWI-D00119:50:H7AP8ADXX:1:2105:7076:23015"
     assert (records[2].sequence() ==
             "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT"
@@ -78,6 +87,9 @@ def test_bam_parser_sequences_with_extended_header():
     assert (records[2].qualities() ==
             "@@CFFFDFGFHHHJIIJIJIJJJJJJJIIJJJJIIJIJFIIJJJJIIIGIJJJJDHIJIIJIJJJ"
             "HHGGCB>BDDDDDDDDDDDBDDEDDDDDDDDDDDDD")
+    assert (records[2].tags() ==
+            b"X0C\x01X1C\x00MDZ72G28\x00PGZMarkDuplicates\x00RGZ1\x00XGC\x00"
+            b"AMC%NMC\x01SMC%XMC\x01XOC\x00XTAU")
 
 
 @pytest.mark.parametrize("end", range(len(COMPLETE_HEADER) + 1,
@@ -143,6 +155,7 @@ def test_bam_parser_no_quals():
     assert records[0][0].name() == "Myheader"
     assert records[0][0].sequence() == "GATTACA"
     assert records[0][0].qualities() == "!!!!!!!"
+    assert records[0][0].tags() == b"RGZA\x00"
 
 
 def test_bam_parser_skip_secondary_supplementary():
diff --git a/tests/test_fastq_parser.py b/tests/test_fastq_parser.py
index 2e2f0eb..a570a74 100644
--- a/tests/test_fastq_parser.py
+++ b/tests/test_fastq_parser.py
@@ -34,12 +34,15 @@ def test_fastq_parser():
     assert records[0].name() == "Myheader/1"
     assert records[0].sequence() == "GATTACA"
     assert records[0].qualities() == "HHHHHHH"
+    assert records[0].tags() == b""
     assert records[1].name() == "AnotherHeader/1"
     assert records[1].sequence() == "ACATTAG"
     assert records[1].qualities() == "KKKKKKK"
+    assert records[1].tags() == b""
     assert records[2].name() == "YetAnotherHeader/1"
     assert records[2].sequence() == "AAAATTTT"
     assert records[2].qualities() == "XKLLCCCC"
+    assert records[2].tags() == b""
     assert len(records) == 3
 
 
diff --git a/tests/test_nano_stats.py b/tests/test_nano_stats.py
index e63e663..c99230b 100644
--- a/tests/test_nano_stats.py
+++ b/tests/test_nano_stats.py
@@ -15,12 +15,13 @@
 # along with Sequali.  If not, see <https://www.gnu.org/licenses/
 
 import datetime
+import struct
 
 from sequali import FastqRecordView
 from sequali._qc import NanoStats
 
 
-def test_nano_stats():
+def test_nano_stats_from_header():
 
     view = FastqRecordView("cb1dab45-aa4c-43fc-a91e-ad0ecc92f5c9 "
                            "runid=c989c681b782549923cb0a02c95f6ec9d2534335 "
@@ -45,5 +46,48 @@ def test_nano_stats():
     info = nano_info_list[0]
     assert info.start_time == timestamp
     assert info.channel_id == 444
-    assert info.length == 4
+    assert info.length == len(view.sequence())
     assert info.cumulative_error_rate == cumulative_error_rate
+    assert info.duration == 0.0
+
+
+def test_nano_stats_from_tags():
+    cumulative_error_rate = 4 * 10 ** (-(ord("A") - 33) / 10)
+    read_number = 10
+    channel_id = 444
+    duration = 2.5
+    timestamp_string = b"2021-09-30T11:34:08Z\x00"
+    readgroup_string = b"SS_A1\x00"
+    tags = struct.pack(
+        f"<"
+        f"3sB"
+        f"3sH"
+        f"3s{len(timestamp_string)}s"
+        f"3s{len(readgroup_string)}s"
+        f"3sf",
+        b"rnC", read_number,
+        b"chS", channel_id,
+        b"stZ", timestamp_string,
+        b"RGZ", readgroup_string,
+        b"duf", duration,
+    )
+    view = FastqRecordView("cb1dab45-aa4c-43fc-a91e-ad0ecc92f5c9",
+                           "ACGT",
+                           "AAAA",
+                           tags)
+    print(tags)
+    tm = datetime.datetime.fromisoformat("2021-09-30T11:34:08")
+    tm = tm.replace(tzinfo=datetime.timezone.utc)
+    timestamp = tm.timestamp()
+    nanostats = NanoStats()
+    nanostats.add_read(view)
+    assert nanostats.minimum_time == timestamp
+    assert nanostats.maximum_time == timestamp
+    nano_info_list = list(nanostats.nano_info_iterator())
+    assert len(nano_info_list) == 1
+    info = nano_info_list[0]
+    assert info.start_time == timestamp
+    assert info.channel_id == 444
+    assert info.length == len(view.sequence())
+    assert info.cumulative_error_rate == cumulative_error_rate
+    assert info.duration == duration

From 3019b99ffb030735dc8c19a17de48bc45d162ec2 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Tue, 3 Dec 2024 13:00:10 +0100
Subject: [PATCH 2/3] Do not use internal FASTQ representation anymore

---
 src/sequali/_qcmodule.c | 180 +++++++++++++++++-----------------------
 1 file changed, 77 insertions(+), 103 deletions(-)

diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index fd2109f..63f7ac7 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -335,8 +335,10 @@ time_string_to_timestamp(const uint8_t *time_string)
 */
 
 struct FastqMeta {
-    uint8_t *record_start;
-    // name_offset is always 1, so no variable needed
+    union {
+        uint8_t *record_start;
+        uint8_t *name;  // Name is always at the start of the record.
+    };
     uint32_t name_length;
     uint32_t sequence_offset;
     // Sequence length and qualities length should be the same
@@ -427,8 +429,7 @@ FastqRecordView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
         tags_length = PyBytes_Size(tags_obj);
         tags = PyBytes_AsString(tags_obj);
     }
-    size_t total_length =
-        name_length + sequence_length + qualities_length + 6 + tags_length;
+    size_t total_length = name_length + sequence_length * 2 + tags_length;
     if (total_length > UINT32_MAX) {
         // lengths are saved as uint32_t types so throw an error;
         PyErr_Format(
@@ -462,31 +463,20 @@ FastqRecordView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
     uint8_t *buffer = (uint8_t *)PyBytes_AsString(bytes_obj);  // No check needed.
     self->meta.record_start = buffer;
     self->meta.name_length = name_length;
-    self->meta.sequence_offset = 2 + name_length;
+    self->meta.sequence_offset = name_length;
     self->meta.sequence_length = sequence_length;
-    self->meta.qualities_offset = 5 + name_length + sequence_length;
-    self->meta.tags_offset = name_length + sequence_length * 2 + 6;
+    self->meta.qualities_offset = name_length + sequence_length;
+    self->meta.tags_offset = name_length + sequence_length * 2;
     self->meta.tags_length = tags_length;
     self->meta.accumulated_error_rate = accumulated_error_rate;
     self->obj = bytes_obj;
 
-    buffer[0] = '@';
-    memcpy(buffer + 1, name, name_length);
-    size_t cursor = 1 + name_length;
-    buffer[cursor] = '\n';
-    cursor += 1;
+    memcpy(buffer, name, name_length);
+    size_t cursor = name_length;
     memcpy(buffer + cursor, sequence, sequence_length);
     cursor += sequence_length;
-    buffer[cursor] = '\n';
-    cursor += 1;
-    buffer[cursor] = '+';
-    cursor += 1;
-    buffer[cursor] = '\n';
-    cursor += 1;
     memcpy(buffer + cursor, qualities, sequence_length);
     cursor += sequence_length;
-    buffer[cursor] = '\n';
-    cursor += 1;
     memcpy(buffer + cursor, tags, tags_length);
     return (PyObject *)self;
 }
@@ -500,7 +490,7 @@ PyDoc_STRVAR(FastqRecordView_name__doc__,
 static PyObject *
 FastqRecordView_name(FastqRecordView *self, PyObject *Py_UNUSED(ignore))
 {
-    return PyUnicode_DecodeASCII((char *)self->meta.record_start + 1,
+    return PyUnicode_DecodeASCII((char *)self->meta.name,
                                  self->meta.name_length, NULL);
 }
 
@@ -656,7 +646,7 @@ FastqRecordArrayView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs
             return NULL;
         }
         FastqRecordView *record = (FastqRecordView *)item;
-        size_t memory_size = 6 + record->meta.name_length +
+        size_t memory_size = record->meta.name_length +
                              record->meta.sequence_length * 2 +
                              record->meta.tags_length;
         total_memory_size += memory_size;
@@ -681,24 +671,14 @@ FastqRecordArrayView__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs
         FastqRecordView *record =
             (FastqRecordView *)PySequence_GetItem(view_fastseq, i);
         struct FastqMeta meta = record->meta;
-        record_ptr[0] = '@';
-        record_ptr += 1;
-        memcpy(record_ptr, meta.record_start + 1, meta.name_length);
+        memcpy(record_ptr, meta.record_start, meta.name_length);
         record_ptr += meta.name_length;
-        record_ptr[0] = '\n';
-        record_ptr += 1;
         memcpy(record_ptr, meta.record_start + meta.sequence_offset,
                meta.sequence_length);
         record_ptr += meta.sequence_length;
-        record_ptr[0] = '\n';
-        record_ptr[1] = '+';
-        record_ptr[2] = '\n';
-        record_ptr += 3;
         memcpy(record_ptr, meta.record_start + meta.qualities_offset,
                meta.sequence_length);
         record_ptr += meta.sequence_length;
-        record_ptr[0] = '\n';
-        record_ptr += 1;
         memcpy(record_ptr, meta.record_start + meta.tags_offset, meta.tags_length);
         record_ptr += meta.tags_length;
         memcpy(metas + i, &record->meta, sizeof(struct FastqMeta));
@@ -751,17 +731,21 @@ FastqRecordArrayView__length__(FastqRecordArrayView *self)
  * @param name2_length The length of the second name
  */
 static inline bool
-fastq_names_are_mates(const char *name1, const char *name2, size_t name2_length)
+fastq_names_are_mates(const char *name1, const char *name2,
+                      size_t name1_length, size_t name2_length)
 {
-    /* strcspn will return the length of the string if none of the complement
-       characters is found, so name1_length is not necessary as a parameter.*/
-    size_t id_length = strcspn(name1, " \t\n");
+    size_t id_length = 0;
+    for (; id_length < name1_length; id_length++) {
+        if (name1[id_length] == ' ' || name1[id_length] == '\t') {
+            break;
+        }
+    }
     if (name2_length < id_length) {
         return false;
     }
     if (name2_length > id_length) {
         char id2_sep = name2[id_length];
-        if (!(id2_sep == ' ' || id2_sep == '\t' || id2_sep == '\n')) {
+        if (!(id2_sep == ' ' || id2_sep == '\t')) {
             return false;
         }
     }
@@ -815,10 +799,11 @@ FastqRecordArrayView_is_mate(FastqRecordArrayView *self, PyObject *other_obj)
     for (Py_ssize_t i = 0; i < length; i++) {
         struct FastqMeta *record1 = self_records + i;
         struct FastqMeta *record2 = other_records + i;
-        char *name1 = (char *)record1->record_start + 1;
-        char *name2 = (char *)record2->record_start + 1;
+        char *name1 = (char *)record1->name;
+        char *name2 = (char *)record2->name;
+        size_t name1_length = record1->name_length;
         size_t name2_length = record2->name_length;
-        if (!fastq_names_are_mates(name1, name2, name2_length)) {
+        if (!fastq_names_are_mates(name1, name2, name1_length, name2_length)) {
             Py_RETURN_FALSE;
         }
     }
@@ -1085,12 +1070,13 @@ FastqParser_create_record_array(FastqParser *self, size_t min_records,
                 Py_DECREF(new_buffer_obj);
                 return NULL;
             }
+            uint8_t *name_start = record_start + 1;
             uint8_t *name_end =
-                memchr(record_start, '\n', buffer_end - record_start);
+                memchr(name_start, '\n', buffer_end - record_start);
             if (name_end == NULL) {
                 break;
             }
-            size_t name_length = name_end - (record_start + 1);
+            size_t name_length = name_end - name_start;
             uint8_t *sequence_start = name_end + 1;
             uint8_t *sequence_end =
                 memchr(sequence_start, '\n', buffer_end - sequence_start);
@@ -1121,8 +1107,8 @@ FastqParser_create_record_array(FastqParser *self, size_t min_records,
             }
             size_t qualities_length = qualities_end - qualities_start;
             if (sequence_length != qualities_length) {
-                PyObject *record_name_obj = PyUnicode_DecodeASCII(
-                    (char *)record_start + 1, name_length, NULL);
+                PyObject *record_name_obj =
+                    PyUnicode_DecodeASCII((char *)name_start, name_length, NULL);
                 PyErr_Format(PyExc_ValueError,
                              "Record sequence and qualities do not have equal "
                              "length, %R",
@@ -1142,12 +1128,12 @@ FastqParser_create_record_array(FastqParser *self, size_t min_records,
                 self->meta_buffer_size = parsed_records;
             }
             struct FastqMeta *meta = self->meta_buffer + (parsed_records - 1);
-            meta->record_start = record_start;
+            meta->record_start = name_start;
             meta->name_length = name_length;
-            meta->sequence_offset = sequence_start - record_start;
+            meta->sequence_offset = sequence_start - name_start;
             meta->sequence_length = sequence_length;
-            meta->qualities_offset = qualities_start - record_start;
-            meta->tags_offset = qualities_end + 1 - record_start;
+            meta->qualities_offset = qualities_start - name_start;
+            meta->tags_offset = qualities_end - name_start;
             meta->tags_length = 0;
             meta->accumulated_error_rate = 0.0;
             record_start = qualities_end + 1;
@@ -1406,7 +1392,7 @@ BamParser__next__(BamParser *self)
     record_start = self->read_in_buffer;
     buffer_end = record_start + leftover_size;
     size_t parsed_records = 0;
-    PyObject *fastq_buffer_obj = NULL;
+    PyObject *read_data_obj = NULL;
     struct QCModuleState *state = get_qc_module_state_from_obj(self);
     if (state == NULL) {
         return NULL;
@@ -1431,7 +1417,7 @@ BamParser__next__(BamParser *self)
             uint8_t *tmp_read_in_buffer =
                 PyMem_Realloc(self->read_in_buffer, minimum_space_required);
             if (tmp_read_in_buffer == NULL) {
-                Py_XDECREF(fastq_buffer_obj);
+                Py_XDECREF(read_data_obj);
                 return PyErr_NoMemory();
             }
             self->read_in_buffer = tmp_read_in_buffer;
@@ -1447,7 +1433,7 @@ BamParser__next__(BamParser *self)
             PyObject_CallMethod(self->file_obj, "readinto", "O", buffer_view);
         Py_DECREF(buffer_view);
         if (read_bytes_obj == NULL) {
-            Py_XDECREF(fastq_buffer_obj);
+            Py_XDECREF(read_data_obj);
             return NULL;
         }
         Py_ssize_t read_bytes = PyLong_AsSsize_t(read_bytes_obj);
@@ -1456,7 +1442,7 @@ BamParser__next__(BamParser *self)
         if (new_buffer_size == 0) {
             // Entire file is read
             PyErr_SetNone(PyExc_StopIteration);
-            Py_XDECREF(fastq_buffer_obj);
+            Py_XDECREF(read_data_obj);
             return NULL;
         }
         else if (read_bytes == 0) {
@@ -1466,7 +1452,7 @@ BamParser__next__(BamParser *self)
                          "Incomplete record at the end of file %R",
                          remaining_obj);
             Py_DECREF(remaining_obj);
-            Py_XDECREF(fastq_buffer_obj);
+            Py_XDECREF(read_data_obj);
             return NULL;
         }
 
@@ -1480,28 +1466,28 @@ BamParser__next__(BamParser *self)
            of the bam record. Quality always maps one to one, but sequence is
            compressed and maps one to two. So that is a 3:4 ratio for BAM:FASTQ.
         */
-        Py_ssize_t fastq_buffer_size = (new_buffer_size * 4 + 2) / 3;
-        if (fastq_buffer_obj == NULL) {
-            fastq_buffer_obj = PyBytes_FromStringAndSize(NULL, fastq_buffer_size);
-            if (fastq_buffer_obj == NULL) {
-                Py_XDECREF(fastq_buffer_obj);
+        Py_ssize_t read_data_size = (new_buffer_size * 4 + 2) / 3;
+        if (read_data_obj == NULL) {
+            read_data_obj = PyBytes_FromStringAndSize(NULL, read_data_size);
+            if (read_data_obj == NULL) {
+                Py_XDECREF(read_data_obj);
                 return PyErr_NoMemory();
             }
         }
         else {
-            PyObject *old_fastq_buffer_obj = fastq_buffer_obj;
-            fastq_buffer_obj = PyBytes_FromStringAndSize(NULL, fastq_buffer_size);
-            if (fastq_buffer_obj == NULL) {
-                Py_DECREF(old_fastq_buffer_obj);
+            PyObject *old_read_data_obj = read_data_obj;
+            read_data_obj = PyBytes_FromStringAndSize(NULL, read_data_size);
+            if (read_data_obj == NULL) {
+                Py_DECREF(old_read_data_obj);
                 return NULL;
             }
-            memcpy(PyBytes_AsString(fastq_buffer_obj),
-                   PyBytes_AsString(old_fastq_buffer_obj),
-                   PyBytes_Size(old_fastq_buffer_obj));
-            Py_DECREF(old_fastq_buffer_obj);
+            memcpy(PyBytes_AsString(read_data_obj),
+                   PyBytes_AsString(old_read_data_obj),
+                   PyBytes_Size(old_read_data_obj));
+            Py_DECREF(old_read_data_obj);
         }
-        uint8_t *fastq_buffer_record_start =
-            (uint8_t *)PyBytes_AsString(fastq_buffer_obj);
+        uint8_t *read_data_record_start =
+            (uint8_t *)PyBytes_AsString(read_data_obj);
 
         while (1) {
             if (record_start + 4 >= buffer_end) {
@@ -1526,39 +1512,29 @@ BamParser__next__(BamParser *self)
             uint32_t seq_length = header->l_seq;
             uint32_t encoded_seq_length = (seq_length + 1) / 2;
             uint8_t *bam_qual_start = bam_seq_start + encoded_seq_length;
-            fastq_buffer_record_start[0] = '@';
-            uint8_t *fastq_buffer_cursor = fastq_buffer_record_start + 1;
             uint8_t *tag_start = bam_qual_start + seq_length;
             size_t tags_length = record_end - tag_start;
+
+            uint8_t *read_data_cursor = read_data_record_start;
             if (name_length > 0) {
                 name_length -= 1; /* Includes terminating NULL byte */
-                memcpy(fastq_buffer_cursor, bam_name_start, name_length);
+                memcpy(read_data_cursor, bam_name_start, name_length);
             }
-            /* The + and newlines are unncessary, but also do not require much
-               space and compute time. So keep the in-memory presentation as
-               a FASTQ record for homogeneity with FASTQ input. */
-            fastq_buffer_cursor += name_length;
-            fastq_buffer_cursor[0] = '\n';
-            fastq_buffer_cursor += 1;
-            decode_bam_sequence(fastq_buffer_cursor, bam_seq_start, seq_length);
-            fastq_buffer_cursor += seq_length;
-            memcpy(fastq_buffer_cursor, "\n+\n", 3);
-            fastq_buffer_cursor += 3;
+            read_data_cursor += name_length;
+            decode_bam_sequence(read_data_cursor, bam_seq_start, seq_length);
+            read_data_cursor += seq_length;
             if (seq_length && bam_qual_start[0] == 0xff) {
                 /* If qualities are missing, all bases are set to 0xff, which
                    is an invalid phred value. Create a quality string with only
                    zero Phreds for a valid FASTQ representation */
-                memset(fastq_buffer_cursor, 33, seq_length);
+                memset(read_data_cursor, 33, seq_length);
             }
             else {
-                decode_bam_qualities(fastq_buffer_cursor, bam_qual_start,
-                                     seq_length);
+                decode_bam_qualities(read_data_cursor, bam_qual_start, seq_length);
             }
-            fastq_buffer_cursor += seq_length;
-            fastq_buffer_cursor[0] = '\n';
-            fastq_buffer_cursor += 1;
-            memcpy(fastq_buffer_cursor, tag_start, tags_length);
-            fastq_buffer_cursor += tags_length;
+            read_data_cursor += seq_length;
+            memcpy(read_data_cursor, tag_start, tags_length);
+            read_data_cursor += tags_length;
             parsed_records += 1;
             if (parsed_records > self->meta_buffer_size) {
                 struct FastqMeta *tmp = PyMem_Realloc(
@@ -1570,11 +1546,10 @@ BamParser__next__(BamParser *self)
                 self->meta_buffer_size = parsed_records;
             }
             struct FastqMeta *meta = self->meta_buffer + (parsed_records - 1);
-            uint32_t sequence_offset = 1 + name_length + 1;  // For '@' and '\n'
-            uint32_t qualities_offset =
-                sequence_offset + seq_length + 3;  // for '\n+\n'
-            uint32_t tags_offset = qualities_offset + seq_length + 1;
-            meta->record_start = fastq_buffer_record_start;
+            uint32_t sequence_offset = name_length;
+            uint32_t qualities_offset = sequence_offset + seq_length;
+            uint32_t tags_offset = qualities_offset + seq_length;
+            meta->record_start = read_data_record_start;
             meta->name_length = name_length;
             meta->sequence_offset = sequence_offset;
             meta->sequence_length = seq_length;
@@ -1583,15 +1558,15 @@ BamParser__next__(BamParser *self)
             meta->tags_length = tags_length;
             meta->accumulated_error_rate = 0.0;
             record_start = record_end;
-            fastq_buffer_record_start = fastq_buffer_cursor;
+            read_data_record_start = read_data_cursor;
         }
     }
     self->record_start = record_start;
     self->buffer_end = buffer_end;
     PyObject *record_array = FastqRecordArrayView_FromPointerSizeAndObject(
-        self->meta_buffer, parsed_records, fastq_buffer_obj,
+        self->meta_buffer, parsed_records, read_data_obj,
         FastqRecordArrayView_Type);
-    Py_DECREF(fastq_buffer_obj);
+    Py_DECREF(read_data_obj);
     return record_array;
 }
 
@@ -3000,7 +2975,7 @@ PerTileQuality_add_meta(PerTileQuality *self, struct FastqMeta *meta)
         return 0;
     }
     uint8_t *record_start = meta->record_start;
-    const uint8_t *header = record_start + 1;
+    const uint8_t *header = record_start;
     size_t header_length = meta->name_length;
     const uint8_t *qualities = record_start + meta->qualities_offset;
     size_t sequence_length = meta->sequence_length;
@@ -4832,10 +4807,9 @@ NanoStats_add_meta(NanoStats *self, struct FastqMeta *meta)
             return -1;
         }
     }
-    else if (NanoInfo_from_header(meta->record_start + 1, meta->name_length,
-                                  info) != 0) {
-        PyObject *header_obj = PyUnicode_DecodeASCII(
-            (const char *)meta->record_start + 1, meta->name_length, NULL);
+    else if (NanoInfo_from_header(meta->name, meta->name_length, info) != 0) {
+        PyObject *header_obj = PyUnicode_DecodeASCII((const char *)meta->name,
+                                                     meta->name_length, NULL);
         if (header_obj == NULL) {
             return -1;
         }

From b284b5bedd1fe5546868c6a7286a388c29ec6dbe Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Thu, 5 Dec 2024 15:37:31 +0100
Subject: [PATCH 3/3] Faster mate check

---
 src/sequali/_qcmodule.c | 52 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
index 63f7ac7..a6283b8 100644
--- a/src/sequali/_qcmodule.c
+++ b/src/sequali/_qcmodule.c
@@ -719,6 +719,51 @@ FastqRecordArrayView__length__(FastqRecordArrayView *self)
     return Py_SIZE((PyObject *)self);
 }
 
+static inline bool
+is_space(char c)
+{
+    return (c == ' ' || c == '\t');
+}
+
+#define FIND_SPACE_CHUNK_SIZE 8
+/**
+ * @brief strcspn(str, " \t") replacement with length. Returns the offset of
+ * ' ' or '\t' or the length of the string.
+ *
+ * @param str
+ * @param length
+ * @return size_t
+ */
+static inline size_t
+find_space(const char *restrict str, size_t length)
+{
+    const char *restrict cursor = str;
+    const char *end = str + length;
+    const char *vec_end = end - (FIND_SPACE_CHUNK_SIZE - 1);
+    while (cursor < vec_end) {
+        /* Fixed size for loop allows compiler to use inline vectors. */
+        uint8_t results[FIND_SPACE_CHUNK_SIZE];
+        for (size_t i = 0; i < FIND_SPACE_CHUNK_SIZE; i++) {
+            /* Set all bits when is_space is true. This is the same result as
+               when _mm_cmpeq_epi8 is used. Hence an extra AND instruction is
+               prevented. */
+            results[i] = is_space(cursor[i]) ? 255 : 0;
+        }
+        uint64_t *result = (uint64_t *)results;
+        if (result[0]) {
+            break;
+        }
+        cursor += 8;
+    }
+    while (cursor < end) {
+        if (is_space(*cursor)) {
+            break;
+        }
+        cursor++;
+    }
+    return cursor - str;
+}
+
 /**
  * @brief Compare two FASTQ record names to see if they are mates.
  *
@@ -734,12 +779,7 @@ static inline bool
 fastq_names_are_mates(const char *name1, const char *name2,
                       size_t name1_length, size_t name2_length)
 {
-    size_t id_length = 0;
-    for (; id_length < name1_length; id_length++) {
-        if (name1[id_length] == ' ' || name1[id_length] == '\t') {
-            break;
-        }
-    }
+    size_t id_length = find_space(name1, name1_length);
     if (name2_length < id_length) {
         return false;
     }