From 187a6d236cc1d311b8a6062eebd9c33ee644fed7 Mon Sep 17 00:00:00 2001
From: Daniil Porokhnin <dkharmsd@gmail.com>
Date: Fri, 27 Feb 2026 20:53:08 +0300
Subject: [PATCH 1/2] feat: initial implementation of vectorized processing

---
 tokenizer/text.c            | 123 ++++++++++++++++++++++++++++++++++++
 tokenizer/text.h            |  16 +++++
 tokenizer/text_tokenizer.go |  44 +++++++++++++
 3 files changed, 183 insertions(+)
 create mode 100644 tokenizer/text.c
 create mode 100644 tokenizer/text.h

diff --git a/tokenizer/text.c b/tokenizer/text.c
new file mode 100644
index 00000000..5b986a88
--- /dev/null
+++ b/tokenizer/text.c
@@ -0,0 +1,123 @@
+#include "assert.h"
+#include "emmintrin.h"
+#include "immintrin.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+
+#include "text.h"
+
+static inline uint16_t eq_mask(__m128i block, char c) {
+  return _mm_movemask_epi8(_mm_cmpeq_epi8(block, _mm_set1_epi8(c)));
+}
+
+static inline int is_token_char(char c) {
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
+         ('0' <= c && c <= '9') || c == '_' || c == '*';
+}
+
+int32_t asciionly(const char *data, size_t len) {
+  char high = 0x80;
+  __m128i mask = _mm_set1_epi8(high);
+
+  size_t i;
+  int32_t result = 1;
+  for (i = 0; i + 16 < len; i += 16) {
+    __m128i input = _mm_lddqu_si128((__m128i_u *)(data + i));
+    __m128i masked = _mm_and_si128(input, mask);
+
+    result &= (_mm_movemask_epi8(masked) == 0);
+    if (!result)
+      return 0;
+  }
+
+  for (; i < len; i++)
+    result &= ((data[i] & high) == 0);
+
+  return result;
+}
+
+int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi) {
+  // Rebase every byte so that lo becomes 0; bytes below lo wrap past hi - lo.
+  const __m128i rebased = _mm_sub_epi8(block, _mm_set1_epi8(lo));
+  // Unsigned saturating subtract: only bytes within [lo, hi] end up as 0.
+  const __m128i excess = _mm_subs_epu8(rebased, _mm_set1_epi8(hi - lo));
+  const __m128i in_range = _mm_cmpeq_epi8(excess, _mm_setzero_si128());
+
+  // Bit k of the result is set when byte k of block lies in [lo, hi].
+  return _mm_movemask_epi8(in_range);
+}
+
+int32_t tokenize(const char *text, size_t len, span *out, int out_cap) {
+  int count = 0;
+  int token_start = -1;
+
+  size_t i; // block offset; the scalar loop after the vector loop resumes from it
+  for (i = 0; i + 16 <= len; i += 16) { // vector path: full 16-byte blocks only
+    __m128i block = _mm_lddqu_si128((__m128i_u *)(text + i));
+
+    // Bit k is set when byte i+k is [a-zA-Z0-9], '_' or '*'.
+    // TODO: evaluate a PSHUFB classifier; these five tests look costly.
+    uint16_t bitmap_token =
+        boundaries(block, 'a', 'z') | boundaries(block, 'A', 'Z') |
+        boundaries(block, '0', '9') | eq_mask(block, '_') | eq_mask(block, '*');
+    uint16_t bitmap_delimeters = ~bitmap_token;
+
+    if (token_start == -1) {
+      // No open token and no token bytes in this block: nothing to do.
+      if (bitmap_token == 0)
+        continue;
+      token_start = i + __builtin_ctz(bitmap_token); // first token byte opens one
+    }
+
+    // No delimiter in this block: the open token continues into the next one.
+    if (bitmap_delimeters == 0)
+      continue;
+
+    while (bitmap_delimeters && count < out_cap) { // delimiters, lowest offset first
+      int pos = i + __builtin_ctz(bitmap_delimeters);
+
+      if (token_start != -1 && pos > token_start) { // delimiter ends the open token
+        out[count++] = (span){
+            .start = token_start,
+            .len = pos - token_start,
+        };
+        token_start = -1;
+      }
+
+      bitmap_delimeters &= bitmap_delimeters - 1; // drop the delimiter just handled
+      if (token_start == -1 && bitmap_token) {
+        int bit = pos - i;
+        uint16_t remaining = bitmap_token & ~((1 << (bit + 1)) - 1); // token bits above it
+        if (remaining)
+          token_start = i + __builtin_ctz(remaining);
+      }
+    }
+  }
+
+  for (size_t j = i; j < len; j++) { // scalar tail: fewer than 16 bytes remain
+    if (is_token_char(text[j])) {
+      if (token_start == -1)
+        token_start = j;
+      continue;
+    }
+
+    if (token_start != -1 && count < out_cap) {
+      out[count++] = (span){
+          .start = token_start,
+          .len = j - token_start,
+      };
+      token_start = -1;
+    }
+  }
+
+  if (token_start != -1 && count < out_cap) { // a token runs to the end of the input
+    out[count++] = (span){
+        .start = token_start,
+        .len = len - token_start,
+    };
+  }
+
+  return count; // number of spans written to out
+}
diff --git a/tokenizer/text.h b/tokenizer/text.h
new file mode 100644
index 00000000..6560f6f2
--- /dev/null
+++ b/tokenizer/text.h
@@ -0,0 +1,22 @@
+#ifndef SEQDB_TOKENIZER_SIMD_H
+#define SEQDB_TOKENIZER_SIMD_H
+
+#include "assert.h"
+#include "emmintrin.h"
+#include "immintrin.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+
+// span locates one token inside the buffer handed to tokenize().
+typedef struct {
+  uint32_t start; // byte offset of the token's first byte
+  uint32_t len;   // token length in bytes; 32 bits so long tokens do not truncate
+} span;
+
+int32_t asciionly(const char *data, size_t len);
+int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi);
+int32_t tokenize(const char *text, size_t len, span *out, int out_cap);
+
+#endif // SEQDB_TOKENIZER_SIMD_H
diff --git a/tokenizer/text_tokenizer.go b/tokenizer/text_tokenizer.go
index 6a728a91..dba28f2f 100644
--- a/tokenizer/text_tokenizer.go
+++ b/tokenizer/text_tokenizer.go
@@ -1,8 +1,15 @@
 package tokenizer
 
+/*
+#cgo CFLAGS: -O3 -msse3 -g -Wall -Wextra
+#include "text.h"
+*/
+import "C"
+
 import (
 	"unicode"
 	"unicode/utf8"
+	"unsafe"
 
 	"github.com/ozontech/seq-db/metric"
 )
@@ -47,6 +54,18 @@ func (t *TextTokenizer) Tokenize(tokens []MetaToken, name, value []byte, maxFiel
 	value = value[:maxLength]
 	k := 0
 
+	if asciiOnly(value) {
+		spans := tokenize(value)
+
+		for _, s := range spans {
+			start, length := uint32(s.start), uint32(s.len)
+			token := value[start : start+length]
+			tokens = append(tokens, MetaToken{Key: name, Value: token})
+		}
+
+		return tokens
+	}
+
 	hasUpper := false
 	asciiOnly := true
 	// Loop over the string looking for tokens.
@@ -103,3 +122,28 @@ func (t *TextTokenizer) Tokenize(tokens []MetaToken, name, value []byte, maxFiel
 
 	return tokens
 }
+
+func asciiOnly(s []byte) bool {
+	return int32(C.asciionly(
+		(*C.char)(unsafe.Pointer(unsafe.SliceData(s))),
+		C.size_t(len(s)),
+	)) == 1
+}
+
+func tokenize(text []byte) []C.span {
+	if len(text) == 0 {
+		return nil
+	}
+
+	cap := len(text)/2 + 1
+	buf := make([]C.span, cap)
+
+	n := C.tokenize(
+		(*C.char)(unsafe.Pointer(&text[0])),
+		C.size_t(len(text)),
+		&buf[0],
+		C.int(cap),
+	)
+
+	return buf[:n]
+}

From e1d18b5e43dcb50bd1ae5e10f980f308a3df4195 Mon Sep 17 00:00:00 2001
From: Daniil Porokhnin <dkharmsd@gmail.com>
Date: Sat, 28 Feb 2026 12:58:10 +0300
Subject: [PATCH 2/2] perf: reduce allocations

---
 tokenizer/text_tokenizer.go      | 40 ++++++++++++++++++++++++--------
 tokenizer/text_tokenizer_test.go | 26 +++++++++++++++++++++
 tokenizer/{text.c => tokenize.c} | 11 +++++----
 tokenizer/{text.h => tokenize.h} |  0
 4 files changed, 63 insertions(+), 14 deletions(-)
 rename tokenizer/{text.c => tokenize.c} (93%)
 rename tokenizer/{text.h => tokenize.h} (100%)

diff --git a/tokenizer/text_tokenizer.go b/tokenizer/text_tokenizer.go
index dba28f2f..14ce4a6b 100644
--- a/tokenizer/text_tokenizer.go
+++ b/tokenizer/text_tokenizer.go
@@ -2,11 +2,12 @@ package tokenizer
 
 /*
 #cgo CFLAGS: -O3 -msse3 -g -Wall -Wextra
-#include "text.h"
+#include "tokenize.h"
 */
 import "C"
 
 import (
+	"sync"
 	"unicode"
 	"unicode/utf8"
 	"unsafe"
@@ -14,6 +15,13 @@ import (
 	"github.com/ozontech/seq-db/metric"
 )
 
+// spanBufPool recycles the scratch buffers passed to tokenize. Entries are
+// *[]C.span; a freshly created buffer holds 64 spans.
+var spanBufPool = sync.Pool{New: func() any {
+	spans := make([]C.span, 64)
+	return &spans
+}}
+
 type TextTokenizer struct {
 	maxTokenSize               int
 	caseSensitive              bool
@@ -54,18 +62,22 @@ func (t *TextTokenizer) Tokenize(tokens []MetaToken, name, value []byte, maxFiel
 	value = value[:maxLength]
 	k := 0
 
-	if asciiOnly(value) {
-		spans := tokenize(value)
+	bufp := spanBufPool.Get().(*[]C.span) // pooled scratch buffer for the C tokenizer
+	spans, ok := tokenize(value, *bufp)
 
+	if ok { // pure-ASCII input: spans index directly into value
 		for _, s := range spans {
 			start, length := uint32(s.start), uint32(s.len)
 			token := value[start : start+length]
 			tokens = append(tokens, MetaToken{Key: name, Value: token})
 		}
-
+		*bufp = spans[:cap(spans)] // keep a buffer that tokenize may have grown
+		spanBufPool.Put(bufp)
 		return tokens
 	}
 
+	// Non-ASCII input (ok == false): recycle the buffer, use the loop below.
+	spanBufPool.Put(bufp)
 	hasUpper := false
 	asciiOnly := true
 	// Loop over the string looking for tokens.
@@ -130,20 +142,28 @@ func asciiOnly(s []byte) bool {
 	)) == 1
 }
 
-func tokenize(text []byte) []C.span {
+func tokenize(text []byte, buf []C.span) ([]C.span, bool) {
 	if len(text) == 0 {
-		return nil
+		return buf[:0], true // nothing to scan; keep buf reusable
 	}
 
-	cap := len(text)/2 + 1
-	buf := make([]C.span, cap)
+	required := len(text)/2 + 1 // most tokens possible: 1-byte tokens, 1-byte gaps
+	if cap(buf) < required {
+		buf = make([]C.span, required)
+	} else {
+		buf = buf[:required]
+	}
 
 	n := C.tokenize(
 		(*C.char)(unsafe.Pointer(&text[0])),
 		C.size_t(len(text)),
 		&buf[0],
-		C.int(cap),
+		C.int(required),
 	)
 
-	return buf[:n]
+	if n < 0 { // the C side returns -1 when text is not pure ASCII
+		return nil, false
+	}
+
+	return buf[:n], true
 }
diff --git a/tokenizer/text_tokenizer_test.go b/tokenizer/text_tokenizer_test.go
index b56c82b6..e4a8be99 100644
--- a/tokenizer/text_tokenizer_test.go
+++ b/tokenizer/text_tokenizer_test.go
@@ -187,3 +187,29 @@ func TestTextTokenizerUTF8(t *testing.T) {
 	test("пРивеt世界", []string{"пРивеt世界"})
 	test("А", []string{"А"})
 }
+
+func BenchmarkTokenize(b *testing.B) {
+	// ASCII log-like payloads; names carry the approximate payload size.
+	cases := []struct {
+		name string
+		data []byte
+	}{
+		{"short_24B", []byte("GET /api/v1/users 200 OK")},
+		{"medium_150B", []byte("2025-02-27T10:15:30Z INFO worker_3 processed request from 192.168.1.42 method=POST path=/api/v1/orders status=201 latency_ms=12 bytes=4096")},
+		{"long_900B", bytes.Repeat([]byte("connection_timeout from host=server42 region=eu_west error_code=ETIMEDOUT retry_count=3 "), 10)},
+	}
+
+	field := []byte("message")
+	tok := NewTextTokenizer(1000, false, true, 1024)
+
+	for _, c := range cases {
+		b.Run(c.name, func(b *testing.B) {
+			b.SetBytes(int64(len(c.data)))
+			// The destination slice is reused across iterations.
+			var out []MetaToken
+			for b.Loop() {
+				out = tok.Tokenize(out[:0], field, c.data, 0)
+			}
+		})
+	}
+}
diff --git a/tokenizer/text.c b/tokenizer/tokenize.c
similarity index 93%
rename from tokenizer/text.c
rename to tokenizer/tokenize.c
index 5b986a88..4a3ed639 100644
--- a/tokenizer/text.c
+++ b/tokenizer/tokenize.c
@@ -6,7 +6,7 @@
 #include "stdlib.h"
 #include "string.h"
 
-#include "text.h"
+#include "tokenize.h"
 
 static inline uint16_t eq_mask(__m128i block, char c) {
   return _mm_movemask_epi8(_mm_cmpeq_epi8(block, _mm_set1_epi8(c)));
@@ -17,14 +17,14 @@ static inline int is_token_char(char c) {
          ('0' <= c && c <= '9') || c == '_' || c == '*';
 }
 
-int32_t asciionly(const char *data, size_t len) {
+int32_t asciionly(const char *text, size_t len) {
   char high = 0x80;
   __m128i mask = _mm_set1_epi8(high);
 
   size_t i;
   int32_t result = 1;
   for (i = 0; i + 16 < len; i += 16) {
-    __m128i input = _mm_lddqu_si128((__m128i_u *)(data + i));
+    __m128i input = _mm_lddqu_si128((__m128i_u *)(text + i));
     __m128i masked = _mm_and_si128(input, mask);
 
     result &= (_mm_movemask_epi8(masked) == 0);
@@ -33,7 +33,7 @@ int32_t asciionly(const char *data, size_t len) {
   }
 
   for (; i < len; i++)
-    result &= ((data[i] & high) == 0);
+    result &= ((text[i] & high) == 0);
 
   return result;
 }
@@ -50,6 +50,9 @@ int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi) {
 }
 
 int32_t tokenize(const char *text, size_t len, span *out, int out_cap) {
+  if (!asciionly(text, len))
+    return -1;
+
   int count = 0;
   int token_start = -1;
 
diff --git a/tokenizer/text.h b/tokenizer/tokenize.h
similarity index 100%
rename from tokenizer/text.h
rename to tokenizer/tokenize.h