64 changes: 64 additions & 0 deletions tokenizer/text_tokenizer.go
@@ -1,12 +1,27 @@
package tokenizer

/*
#cgo CFLAGS: -O3 -msse3 -g -Wall -Wextra
#include "tokenize.h"
*/
import "C"

import (
"sync"
"unicode"
"unicode/utf8"
"unsafe"

"github.com/ozontech/seq-db/metric"
)

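// spanBufPool recycles span buffers across Tokenize calls so the cgo fast
// path does not allocate per call. Pointers to slices are pooled so that
// sync.Pool.Put itself does not allocate.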
var spanBufPool = sync.Pool{
New: func() any {
buf := make([]C.span, 64)
return &buf
},
}

type TextTokenizer struct {
maxTokenSize int
caseSensitive bool
@@ -47,7 +62,23 @@
value = value[:maxLength]
k := 0

	bufp := spanBufPool.Get().(*[]C.span)
	spans, ok := tokenize(value, *bufp)

	if ok {
		for _, s := range spans {
			start, length := uint32(s.start), uint32(s.len)
			token := value[start : start+length]
			tokens = append(tokens, MetaToken{Key: name, Value: token})
		}
		*bufp = spans[:cap(spans)]
		spanBufPool.Put(bufp)
		return tokens
	}

	// The SIMD fast path only handles ASCII input: return the buffer to the
	// pool and fall back to the scalar Unicode-aware tokenizer below.
	spanBufPool.Put(bufp)

	hasUpper := false
	asciiOnly := true
// Loop over the string looking for tokens.
// Token of TextTokenizer is a string that contains only letters, numbers, '*' or '_'.
@@ -103,3 +134,36 @@

return tokens
}

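// asciiOnly reports whether s contains only ASCII bytes. Safe for empty
// input: the C side never dereferences the pointer when len is zero.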
func asciiOnly(s []byte) bool {
return int32(C.asciionly(
(*C.char)(unsafe.Pointer(unsafe.SliceData(s))),
C.size_t(len(s)),
)) == 1
}

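// tokenize wraps the C tokenizer. It reports ok == false when text contains
// non-ASCII bytes, in which case the caller must take the scalar fallback.
// The returned spans index into text; buf is grown if its capacity is below
// the worst-case span count.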
func tokenize(text []byte, buf []C.span) ([]C.span, bool) {
if len(text) == 0 {
return buf[:0], true
}

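	// Every token needs at least one byte plus a delimiter, so len(text)
	// bytes hold at most len(text)/2+1 tokens.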
required := len(text)/2 + 1
if cap(buf) < required {
buf = make([]C.span, required)
} else {
buf = buf[:required]
}

n := C.tokenize(
(*C.char)(unsafe.Pointer(&text[0])),
C.size_t(len(text)),
&buf[0],
		C.int(required),
)

if n < 0 {
return nil, false
}

return buf[:n], true
}
26 changes: 26 additions & 0 deletions tokenizer/text_tokenizer_test.go
@@ -14,7 +14,7 @@

func TestTokenizeEmptyValue(t *testing.T) {
testCase := []byte("")
tokenizer := NewTextTokenizer(1000, false, true, 1024)

tokens := tokenizer.Tokenize([]MetaToken{}, []byte("message"), testCase, maxTokenSizeDummy)
expected := []MetaToken{newMetaToken("message", "")}
@@ -24,7 +24,7 @@

func TestTokenizeSimple(t *testing.T) {
testCase := []byte("arr hello world")
tokenizer := NewTextTokenizer(1000, false, true, 1024)

tokens := tokenizer.Tokenize(nil, []byte("message"), testCase, maxTokenSizeDummy)
assert.Equal(t, newMetaToken("message", "arr"), tokens[0])
@@ -33,7 +33,7 @@
}

func TestTokenizeSimple2(t *testing.T) {
tokenizer := NewTextTokenizer(1000, false, true, 1024)

tokens := tokenizer.Tokenize(nil, []byte("message"), bytes.Clone(longDocument), maxTokenSizeDummy)

assert.Equal(t, newMetaToken("message", "t1"), tokens[0])
@@ -48,7 +48,7 @@

func TestTokenizePartialDefault(t *testing.T) {
const maxSize = 100500
tokenizer := NewTextTokenizer(maxSize, false, true, maxSize)

testCase := []byte(strings.Repeat("1", maxSize+1))

tokens := tokenizer.Tokenize([]MetaToken{}, []byte("message"), testCase, maxTokenSizeDummy)
@@ -60,7 +60,7 @@

func TestTokenizePartial(t *testing.T) {
const maxSize = 100500
tokenizer := NewTextTokenizer(maxSize, false, true, 0)

testCase := []byte(strings.Repeat("1", maxSize+1))

tokens := tokenizer.Tokenize(nil, []byte("message"), testCase, maxSize)
@@ -72,7 +72,7 @@

func TestTokenizePartialSkipDefault(t *testing.T) {
const maxSize = 100500
tokenizer := NewTextTokenizer(maxSize, false, false, maxSize)

testCase := []byte(strings.Repeat("1", maxSize+1))

tokens := tokenizer.Tokenize([]MetaToken{}, []byte("message"), testCase, maxTokenSizeDummy)
@@ -187,3 +187,29 @@
test("пРивеt世界", []string{"пРивеt世界"})
test("А", []string{"А"})
}
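
// A differential check between the SIMD fast path and a plain Go model makes
// regressions easy to catch. This is a hedged sketch, not part of the PR: it
// assumes Tokenize emits one MetaToken per token for non-empty ASCII input
// (as the tests above suggest); the test name and corpus are illustrative,
// and the inputs are kept lowercase to sidestep any case folding.
func TestTokenizeMatchesGoModel(t *testing.T) {
	isTok := func(c byte) bool {
		return c == '_' || c == '*' ||
			('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
			('0' <= c && c <= '9')
	}
	tokenizer := NewTextTokenizer(1000, false, true, 1024)
	for _, input := range []string{
		"a",
		"get /api/v1/users 200 ok",
		strings.Repeat("ab_cd ", 50),
	} {
		// Re-derive the expected tokens with a scalar scan.
		var want []string
		start := -1
		for i := 0; i <= len(input); i++ {
			if i < len(input) && isTok(input[i]) {
				if start == -1 {
					start = i
				}
				continue
			}
			if start != -1 {
				want = append(want, input[start:i])
				start = -1
			}
		}

		got := tokenizer.Tokenize(nil, []byte("message"), []byte(input), maxTokenSizeDummy)
		vals := make([]string, 0, len(got))
		for _, tok := range got {
			vals = append(vals, string(tok.Value))
		}
		assert.Equal(t, want, vals, "input %q", input)
	}
}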

func BenchmarkTokenize(b *testing.B) {
tokenizer := NewTextTokenizer(1000, false, true, 1024)
name := []byte("message")

short := []byte("GET /api/v1/users 200 OK")
medium := []byte("2025-02-27T10:15:30Z INFO worker_3 processed request from 192.168.1.42 method=POST path=/api/v1/orders status=201 latency_ms=12 bytes=4096")
long := bytes.Repeat([]byte("connection_timeout from host=server42 region=eu_west error_code=ETIMEDOUT retry_count=3 "), 10)

for _, tc := range []struct {
name string
data []byte
}{
{"short_24B", short},
{"medium_150B", medium},
{"long_900B", long},
} {
b.Run(tc.name, func(b *testing.B) {
b.SetBytes(int64(len(tc.data)))
var tokens []MetaToken
for b.Loop() {
tokens = tokenizer.Tokenize(tokens[:0], name, tc.data, 0)
}
})
}
}
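
To reproduce the benchmark locally (testing.B.Loop needs Go 1.24 or newer, and the cgo flags above assume an SSE3-capable x86-64 host):

go test -bench BenchmarkTokenize -benchmem ./tokenizer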
126 changes: 126 additions & 0 deletions tokenizer/tokenize.c
@@ -0,0 +1,126 @@
#include "assert.h"
#include "emmintrin.h"
#include "immintrin.h"
#include "stdint.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"

#include "tokenize.h"

static inline uint16_t eq_mask(__m128i block, char c) {
return _mm_movemask_epi8(_mm_cmpeq_epi8(block, _mm_set1_epi8(c)));
}

static inline int is_token_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
('0' <= c && c <= '9') || c == '_' || c == '*';
}

int32_t asciionly(const char *text, size_t len) {
  const char high = (char)0x80;
  __m128i mask = _mm_set1_epi8(high);

  size_t i;
  int32_t result = 1;
  for (i = 0; i + 16 <= len; i += 16) {
    __m128i input = _mm_lddqu_si128((const __m128i_u *)(text + i));
    __m128i masked = _mm_and_si128(input, mask);

    if (_mm_movemask_epi8(masked) != 0)
      return 0;
  }

for (; i < len; i++)
result &= ((text[i] & high) == 0);

return result;
}

uint16_t boundaries(__m128i block, uint8_t lo, uint8_t hi) {
  __m128i block_lo = _mm_set1_epi8((char)lo);
  __m128i block_zero = _mm_setzero_si128();
  __m128i block_range = _mm_set1_epi8((char)(hi - lo));

  // The wrapping subtract maps bytes in [lo, hi] to [0, hi - lo]; the
  // unsigned saturating subtract then zeroes exactly those lanes.
  __m128i res = _mm_sub_epi8(block, block_lo);
  res = _mm_subs_epu8(res, block_range);

  return (uint16_t)_mm_movemask_epi8(_mm_cmpeq_epi8(res, block_zero));
}
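
/* Scalar model of boundaries() for a single byte (illustrative sketch, not
 * part of the PR): the wrapping subtract maps bytes in [lo, hi] to 0..hi-lo
 * and all other bytes to something larger, so the saturating subtract lands
 * on zero exactly for in-range bytes. */
static inline int in_range_scalar(uint8_t c, uint8_t lo, uint8_t hi) {
  uint8_t shifted = (uint8_t)(c - lo);  /* wraps below lo, like _mm_sub_epi8 */
  return shifted <= (uint8_t)(hi - lo); /* zero after _mm_subs_epu8 */
}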

int32_t tokenize(const char *text, size_t len, span *out, int out_cap) {
if (!asciionly(text, len))
return -1;

int count = 0;
int token_start = -1;

size_t i;
for (i = 0; i + 16 <= len; i += 16) {
__m128i block = _mm_lddqu_si128((__m128i_u *)(text + i));

    // TODO: evaluate a PSHUFB-based classifier; the five range/equality
    // checks per block may carry avoidable overhead.
    uint16_t bitmap_token =
        boundaries(block, 'a', 'z') | boundaries(block, 'A', 'Z') |
        boundaries(block, '0', '9') | eq_mask(block, '_') | eq_mask(block, '*');
    uint16_t bitmap_delimiters = ~bitmap_token;

    if (token_start == -1) {
      // No token characters anywhere in this 16-byte block.
      if (bitmap_token == 0)
        continue;
      token_start = i + __builtin_ctz(bitmap_token);
    }

    // The whole 16-byte block is token characters: the current token
    // continues into the next block.
    if (bitmap_delimiters == 0)
      continue;

    while (bitmap_delimiters && count < out_cap) {
      int pos = i + __builtin_ctz(bitmap_delimiters);

      if (token_start != -1 && pos > token_start) {
        out[count++] = (span){
            .start = token_start,
            .len = pos - token_start,
        };
        token_start = -1;
      }

      bitmap_delimiters &= bitmap_delimiters - 1;
      if (token_start == -1 && bitmap_token) {
        int bit = pos - i;
        uint16_t remaining = bitmap_token & ~((1 << (bit + 1)) - 1);
        if (remaining)
          token_start = i + __builtin_ctz(remaining);
      }
    }
}

for (size_t j = i; j < len; j++) {
if (is_token_char(text[j])) {
if (token_start == -1)
token_start = j;
continue;
}

if (token_start != -1 && count < out_cap) {
out[count++] = (span){
.start = token_start,
.len = j - token_start,
};
token_start = -1;
}
}

if (token_start != -1 && count < out_cap) {
out[count++] = (span){
.start = token_start,
.len = len - token_start,
};
}

return count;
}
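
/* Differential-testing sketch (illustrative, not part of the PR): a scalar
 * tokenizer with the same contract as tokenize(), useful for fuzzing the
 * SIMD path against a straightforward reference. */
static inline int32_t tokenize_scalar(const char *text, size_t len, span *out,
                                      int out_cap) {
  if (!asciionly(text, len))
    return -1;

  int count = 0;
  int token_start = -1;
  for (size_t j = 0; j < len; j++) {
    if (is_token_char(text[j])) {
      if (token_start == -1)
        token_start = (int)j;
      continue;
    }
    if (token_start != -1 && count < out_cap) {
      out[count++] = (span){.start = (uint32_t)token_start,
                            .len = (uint16_t)(j - token_start)};
      token_start = -1;
    }
  }

  if (token_start != -1 && count < out_cap)
    out[count++] = (span){.start = (uint32_t)token_start,
                          .len = (uint16_t)(len - token_start)};

  return count;
}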
16 changes: 16 additions & 0 deletions tokenizer/tokenize.h
@@ -0,0 +1,16 @@
#include "assert.h"
#include "emmintrin.h"
#include "immintrin.h"
#include "stdint.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"

typedef struct {
uint32_t start;
uint16_t len;
} span;

int32_t asciionly(const char *data, size_t len);
int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi);
int32_t tokenize(const char *text, size_t len, span *out, int out_cap);