From 187a6d236cc1d311b8a6062eebd9c33ee644fed7 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Fri, 27 Feb 2026 20:53:08 +0300 Subject: [PATCH 1/2] feat: initial implementation of vectorized processing --- [Review notes; this text sits after the three-dash separator and is not part of the commit message. What the patch does: adds tokenizer/text.c and tokenizer/text.h with three C entry points and calls them through cgo from TextTokenizer.Tokenize. asciionly() returns 1 when no byte has its high bit set, testing 16 bytes per SSE load and the remaining bytes one at a time. boundaries() returns a 16-bit mask with one bit set for every byte of the block that lies in the inclusive range lo..hi. tokenize() writes a span (start, len) into out for each run of [A-Za-z0-9_*] it finds, never more than out_cap of them, and returns the number written; a run may continue across 16-byte blocks through token_start, and the bytes left after the last full block are handled by a scalar loop. Points to confirm: (1) span.len is uint16_t while token_start and pos are int, so a run longer than 65535 bytes does not fit in len; presumably the maxLength cut in the Go caller keeps values far below that, verify. (2) The vector loop in asciionly() stops at i + 16 < len while tokenize() uses i + 16 <= len, so asciionly() leaves a final full block to its scalar loop. (3) The Go fast path returns before the existing loop that looks for tokens, so whatever that loop does (it tracks hasUpper) is skipped for ASCII input; confirm this is intended. (4) text.h has no include guard and includes emmintrin.h and immintrin.h, and the cgo flags add -msse3; no build constraint is visible in this diff, confirm how non-x86 targets are meant to build.] tokenizer/text.c | 123 ++++++++++++++++++++++++++++++++++++ tokenizer/text.h | 16 +++++ tokenizer/text_tokenizer.go | 44 +++++++++++++ 3 files changed, 183 insertions(+) create mode 100644 tokenizer/text.c create mode 100644 tokenizer/text.h diff --git a/tokenizer/text.c b/tokenizer/text.c new file mode 100644 index 00000000..5b986a88 --- /dev/null +++ b/tokenizer/text.c @@ -0,0 +1,123 @@ +#include "assert.h" +#include "emmintrin.h" +#include "immintrin.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#include "string.h" + +#include "text.h" + +static inline uint16_t eq_mask(__m128i block, char c) { + return _mm_movemask_epi8(_mm_cmpeq_epi8(block, _mm_set1_epi8(c))); +} + +static inline int is_token_char(char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || + ('0' <= c && c <= '9') || c == '_' || c == '*'; +} + +int32_t asciionly(const char *data, size_t len) { + char high = 0x80; + __m128i mask = _mm_set1_epi8(high); + + size_t i; + int32_t result = 1; + for (i = 0; i + 16 < len; i += 16) { + __m128i input = _mm_lddqu_si128((__m128i_u *)(data + i)); + __m128i masked = _mm_and_si128(input, mask); + + result &= (_mm_movemask_epi8(masked) == 0); + if (!result) + return 0; + } + + for (; i < len; i++) + result &= ((data[i] & high) == 0); + + return result; +} + +int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi) { + __m128i block_lo = _mm_set1_epi8(lo); + __m128i block_zero = _mm_set1_epi8((char)0x0); + __m128i block_range = _mm_set1_epi8(hi - lo); + + __m128i_u res = _mm_sub_epi8(block, block_lo); + res = _mm_subs_epu8(res, block_range); + + return _mm_movemask_epi8(_mm_cmpeq_epi8(res, block_zero)); +} + +int32_t tokenize(const char *text, size_t len, span *out, int out_cap) { + int count = 0; + int token_start = -1; + + 
size_t i; + for (i = 0; i + 16 <= len; i += 16) { + __m128i block = _mm_lddqu_si128((__m128i_u *)(text + i)); + + // I need to check PSHUFB approach. + // Seems like there is so much overhead in here. + uint16_t bitmap_token = + boundaries(block, 'a', 'z') | boundaries(block, 'A', 'Z') | + boundaries(block, '0', '9') | eq_mask(block, '_') | eq_mask(block, '*'); + uint16_t bitmap_delimeters = ~bitmap_token; + + if (token_start == -1) { + // Whole block of 16 bytes contains no text symbols. + if (bitmap_token == 0) + continue; + token_start = i + __builtin_ctz(bitmap_token); + } + + // Whole block of 16 bytes contains text symbols. + if (bitmap_delimeters == 0) + continue; + + while (bitmap_delimeters && count < out_cap) { + int pos = i + __builtin_ctz(bitmap_delimeters); + + if (token_start != -1 && pos > token_start) { + out[count++] = (span){ + .start = token_start, + .len = pos - token_start, + }; + token_start = -1; + } + + bitmap_delimeters &= bitmap_delimeters - 1; + if (token_start == -1 && bitmap_token) { + int bit = pos - i; + uint16_t remaining = bitmap_token & ~((1 << (bit + 1)) - 1); + if (remaining) + token_start = i + __builtin_ctz(remaining); + } + } + } + + for (size_t j = i; j < len; j++) { + if (is_token_char(text[j])) { + if (token_start == -1) + token_start = j; + continue; + } + + if (token_start != -1 && count < out_cap) { + out[count++] = (span){ + .start = token_start, + .len = j - token_start, + }; + token_start = -1; + } + } + + if (token_start != -1 && count < out_cap) { + out[count++] = (span){ + .start = token_start, + .len = len - token_start, + }; + } + + return count; +} diff --git a/tokenizer/text.h b/tokenizer/text.h new file mode 100644 index 00000000..6560f6f2 --- /dev/null +++ b/tokenizer/text.h @@ -0,0 +1,16 @@ +#include "assert.h" +#include "emmintrin.h" +#include "immintrin.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#include "string.h" + +typedef struct { + uint32_t start; + uint16_t len; +} span; + 
+int32_t asciionly(const char *data, size_t len); +int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi); +int32_t tokenize(const char *text, size_t len, span *out, int out_cap); diff --git a/tokenizer/text_tokenizer.go b/tokenizer/text_tokenizer.go index 6a728a91..dba28f2f 100644 --- a/tokenizer/text_tokenizer.go +++ b/tokenizer/text_tokenizer.go @@ -1,8 +1,15 @@ package tokenizer +/* +#cgo CFLAGS: -O3 -msse3 -g -Wall -Wextra +#include "text.h" +*/ +import "C" + import ( "unicode" "unicode/utf8" + "unsafe" "github.com/ozontech/seq-db/metric" ) @@ -47,6 +54,18 @@ func (t *TextTokenizer) Tokenize(tokens []MetaToken, name, value []byte, maxFiel value = value[:maxLength] k := 0 + if asciiOnly(value) { + spans := tokenize(value) + + for _, s := range spans { + start, length := uint32(s.start), uint32(s.len) + token := value[start : start+length] + tokens = append(tokens, MetaToken{Key: name, Value: token}) + } + + return tokens + } + hasUpper := false asciiOnly := true // Loop over the string looking for tokens. 
@@ -103,3 +122,28 @@ func (t *TextTokenizer) Tokenize(tokens []MetaToken, name, value []byte, maxFiel return tokens } + +func asciiOnly(s []byte) bool { + return int32(C.asciionly( + (*C.char)(unsafe.Pointer(unsafe.SliceData(s))), + C.size_t(len(s)), + )) == 1 +} + +func tokenize(text []byte) []C.span { + if len(text) == 0 { + return nil + } + + cap := len(text)/2 + 1 + buf := make([]C.span, cap) + + n := C.tokenize( + (*C.char)(unsafe.Pointer(&text[0])), + C.size_t(len(text)), + &buf[0], + C.int(cap), + ) + + return buf[:n] +} From e1d18b5e43dcb50bd1ae5e10f980f308a3df4195 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Sat, 28 Feb 2026 12:58:10 +0300 Subject: [PATCH 2/2] perf: reduce allocations --- [Review notes; this text sits after the three-dash separator and is not part of the commit message. What the patch does: renames text.c and text.h to tokenize.c and tokenize.h, makes the C tokenize() call asciionly() first and return -1 for non-ASCII input, reuses span buffers through a sync.Pool (spanBufPool, slices of 64 spans, replaced by a slice of len(text)/2 + 1 spans when the pooled one is too small), and adds BenchmarkTokenize. Points to confirm: (1) when the C tokenize() returns -1 the Go wrapper returns (nil, false) and Tokenize falls through to panic("unreachable"); the original loop after the panic can no longer run, and bufp is not put back into the pool on that path. Any non-ASCII value, such as the TestTextTokenizerUTF8 inputs visible as context in the test hunk, would presumably hit the panic; verify. (2) The benchmark calls bytes.Repeat but the test hunk adds no import; confirm the test file already imports bytes.] tokenizer/text_tokenizer.go | 40 ++++++++++++++++++++++++-------- tokenizer/text_tokenizer_test.go | 26 +++++++++++++++++++++ tokenizer/{text.c => tokenize.c} | 11 +++++---- tokenizer/{text.h => tokenize.h} | 0 4 files changed, 63 insertions(+), 14 deletions(-) rename tokenizer/{text.c => tokenize.c} (93%) rename tokenizer/{text.h => tokenize.h} (100%) diff --git a/tokenizer/text_tokenizer.go b/tokenizer/text_tokenizer.go index dba28f2f..14ce4a6b 100644 --- a/tokenizer/text_tokenizer.go +++ b/tokenizer/text_tokenizer.go @@ -2,11 +2,12 @@ package tokenizer /* #cgo CFLAGS: -O3 -msse3 -g -Wall -Wextra -#include "text.h" +#include "tokenize.h" */ import "C" import ( + "sync" "unicode" "unicode/utf8" "unsafe" @@ -14,6 +15,13 @@ import ( "github.com/ozontech/seq-db/metric" ) +var spanBufPool = sync.Pool{ + New: func() any { + buf := make([]C.span, 64) + return &buf + }, +} + type TextTokenizer struct { maxTokenSize int caseSensitive bool @@ -54,18 +62,22 @@ func (t *TextTokenizer) Tokenize(tokens []MetaToken, name, value []byte, maxFiel value = value[:maxLength] k := 0 - if asciiOnly(value) { - spans := tokenize(value) + bufp := spanBufPool.Get().(*[]C.span) + spans, ok := tokenize(value, *bufp) + if ok { for _, s := range spans { start, length := uint32(s.start), 
uint32(s.len) token := value[start : start+length] tokens = append(tokens, MetaToken{Key: name, Value: token}) } - + *bufp = spans[:cap(spans)] + spanBufPool.Put(bufp) return tokens } + panic("unreachable") + hasUpper := false asciiOnly := true // Loop over the string looking for tokens. @@ -130,20 +142,28 @@ func asciiOnly(s []byte) bool { )) == 1 } -func tokenize(text []byte) []C.span { +func tokenize(text []byte, buf []C.span) ([]C.span, bool) { if len(text) == 0 { - return nil + return buf[:0], true } - cap := len(text)/2 + 1 - buf := make([]C.span, cap) + required := len(text)/2 + 1 + if cap(buf) < required { + buf = make([]C.span, required) + } else { + buf = buf[:required] + } n := C.tokenize( (*C.char)(unsafe.Pointer(&text[0])), C.size_t(len(text)), &buf[0], - C.int(cap), + +C.int(required), ) - return buf[:n] + if n < 0 { + return nil, false + } + + return buf[:n], true } diff --git a/tokenizer/text_tokenizer_test.go b/tokenizer/text_tokenizer_test.go index b56c82b6..e4a8be99 100644 --- a/tokenizer/text_tokenizer_test.go +++ b/tokenizer/text_tokenizer_test.go @@ -187,3 +187,29 @@ func TestTextTokenizerUTF8(t *testing.T) { test("пРивеt世界", []string{"пРивеt世界"}) test("А", []string{"А"}) } + +func BenchmarkTokenize(b *testing.B) { + tokenizer := NewTextTokenizer(1000, false, true, 1024) + name := []byte("message") + + short := []byte("GET /api/v1/users 200 OK") + medium := []byte("2025-02-27T10:15:30Z INFO worker_3 processed request from 192.168.1.42 method=POST path=/api/v1/orders status=201 latency_ms=12 bytes=4096") + long := bytes.Repeat([]byte("connection_timeout from host=server42 region=eu_west error_code=ETIMEDOUT retry_count=3 "), 10) + + for _, tc := range []struct { + name string + data []byte + }{ + {"short_24B", short}, + {"medium_150B", medium}, + {"long_900B", long}, + } { + b.Run(tc.name, func(b *testing.B) { + b.SetBytes(int64(len(tc.data))) + var tokens []MetaToken + for b.Loop() { + tokens = tokenizer.Tokenize(tokens[:0], name, tc.data, 0) 
+ } + }) + } +} diff --git a/tokenizer/text.c b/tokenizer/tokenize.c similarity index 93% rename from tokenizer/text.c rename to tokenizer/tokenize.c index 5b986a88..4a3ed639 100644 --- a/tokenizer/text.c +++ b/tokenizer/tokenize.c @@ -6,7 +6,7 @@ #include "stdlib.h" #include "string.h" -#include "text.h" +#include "tokenize.h" static inline uint16_t eq_mask(__m128i block, char c) { return _mm_movemask_epi8(_mm_cmpeq_epi8(block, _mm_set1_epi8(c))); @@ -17,14 +17,14 @@ static inline int is_token_char(char c) { ('0' <= c && c <= '9') || c == '_' || c == '*'; } -int32_t asciionly(const char *data, size_t len) { +int32_t asciionly(const char *text, size_t len) { char high = 0x80; __m128i mask = _mm_set1_epi8(high); size_t i; int32_t result = 1; for (i = 0; i + 16 < len; i += 16) { - __m128i input = _mm_lddqu_si128((__m128i_u *)(data + i)); + __m128i input = _mm_lddqu_si128((__m128i_u *)(text + i)); __m128i masked = _mm_and_si128(input, mask); result &= (_mm_movemask_epi8(masked) == 0); @@ -33,7 +33,7 @@ int32_t asciionly(const char *data, size_t len) { } for (; i < len; i++) - result &= ((data[i] & high) == 0); + result &= ((text[i] & high) == 0); return result; } @@ -50,6 +50,9 @@ int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi) { } int32_t tokenize(const char *text, size_t len, span *out, int out_cap) { + if (!asciionly(text, len)) + return -1; + int count = 0; int token_start = -1; diff --git a/tokenizer/text.h b/tokenizer/tokenize.h similarity index 100% rename from tokenizer/text.h rename to tokenizer/tokenize.h