diff --git a/tokenizer/text_tokenizer.go b/tokenizer/text_tokenizer.go
index 6a728a91..14ce4a6b 100644
--- a/tokenizer/text_tokenizer.go
+++ b/tokenizer/text_tokenizer.go
@@ -1,12 +1,29 @@
 package tokenizer
 
+/*
+#cgo CFLAGS: -O3 -msse3 -g -Wall -Wextra
+#include "tokenize.h"
+*/
+import "C"
+
 import (
+	"sync"
 	"unicode"
 	"unicode/utf8"
+	"unsafe"
 
 	"github.com/ozontech/seq-db/metric"
 )
 
+// spanBufPool recycles span scratch buffers handed to the C tokenizer so the
+// hot Tokenize path does not allocate on every call.
+var spanBufPool = sync.Pool{
+	New: func() any {
+		buf := make([]C.span, 64)
+		return &buf
+	},
+}
+
 type TextTokenizer struct {
 	maxTokenSize  int
 	caseSensitive bool
@@ -47,6 +62,27 @@ func (t *TextTokenizer) Tokenize(tokens []MetaToken, name, value []byte, maxFiel
 	value = value[:maxLength]
 	k := 0
+	bufp := spanBufPool.Get().(*[]C.span)
+	spans, ok := tokenize(value, *bufp)
+
+	if ok {
+		// NOTE(review): this fast path skips the case folding and
+		// max-token-size handling of the Go path below — confirm that is
+		// acceptable for ASCII-only input.
+		for _, s := range spans {
+			start, length := uint32(s.start), uint32(s.len)
+			token := value[start : start+length]
+			tokens = append(tokens, MetaToken{Key: name, Value: token})
+		}
+		*bufp = spans[:cap(spans)]
+		spanBufPool.Put(bufp)
+		return tokens
+	}
+
+	// tokenize reports !ok for non-ASCII input. Return the pooled buffer and
+	// fall through to the pure-Go tokenizer below, which handles UTF-8.
+	spanBufPool.Put(bufp)
+
 	hasUpper := false
 	asciiOnly := true
 	// Loop over the string looking for tokens.
 
@@ -103,3 +134,45 @@ func (t *TextTokenizer) Tokenize(tokens []MetaToken, name, value []byte, maxFiel
 
 	return tokens
 }
+
+// asciiOnly reports whether s contains only 7-bit ASCII bytes, using the
+// SIMD helper from tokenize.c.
+func asciiOnly(s []byte) bool {
+	return int32(C.asciionly(
+		(*C.char)(unsafe.Pointer(unsafe.SliceData(s))),
+		C.size_t(len(s)),
+	)) == 1
+}
+
+// tokenize splits text into spans using the C SIMD tokenizer, reusing buf
+// when it is large enough. It reports ok == false when text is not pure
+// ASCII; the caller must then fall back to the pure-Go tokenizer. The
+// returned slice aliases buf (or its replacement allocation), so the buffer
+// stays reusable via the pool even on failure.
+func tokenize(text []byte, buf []C.span) ([]C.span, bool) {
+	if len(text) == 0 {
+		return buf[:0], true
+	}
+
+	// Worst case: one-byte tokens separated by one-byte delimiters.
+	required := len(text)/2 + 1
+	if cap(buf) < required {
+		buf = make([]C.span, required)
+	} else {
+		buf = buf[:required]
+	}
+
+	n := C.tokenize(
+		(*C.char)(unsafe.Pointer(&text[0])),
+		C.size_t(len(text)),
+		&buf[0],
+		C.int(required),
+	)
+
+	if n < 0 {
+		// Non-ASCII input: hand the buffer back so the caller can pool it.
+		return buf[:0], false
+	}
+
+	return buf[:n], true
+}
diff --git a/tokenizer/text_tokenizer_test.go b/tokenizer/text_tokenizer_test.go
index b56c82b6..e4a8be99 100644
--- a/tokenizer/text_tokenizer_test.go
+++ b/tokenizer/text_tokenizer_test.go
@@ -187,3 +187,32 @@ func TestTextTokenizerUTF8(t *testing.T) {
 	test("пРивеt世界", []string{"пРивеt世界"})
 	test("А", []string{"А"})
 }
+
+// BenchmarkTokenize measures throughput on short, medium and long log lines.
+// NOTE(review): bytes.Repeat requires "bytes" in this file's import block;
+// add it there if it is not already imported.
+func BenchmarkTokenize(b *testing.B) {
+	tokenizer := NewTextTokenizer(1000, false, true, 1024)
+	name := []byte("message")
+
+	short := []byte("GET /api/v1/users 200 OK")
+	medium := []byte("2025-02-27T10:15:30Z INFO worker_3 processed request from 192.168.1.42 method=POST path=/api/v1/orders status=201 latency_ms=12 bytes=4096")
+	long := bytes.Repeat([]byte("connection_timeout from host=server42 region=eu_west error_code=ETIMEDOUT retry_count=3 "), 10)
+
+	for _, tc := range []struct {
+		name string
+		data []byte
+	}{
+		{"short_24B", short},
+		{"medium_150B", medium},
+		{"long_900B", long},
+	} {
+		b.Run(tc.name, func(b *testing.B) {
+			b.SetBytes(int64(len(tc.data)))
+			var tokens []MetaToken
+			for b.Loop() {
+				tokens = tokenizer.Tokenize(tokens[:0], name, tc.data, 0)
+			}
+		})
+	}
+}
diff --git a/tokenizer/tokenize.c b/tokenizer/tokenize.c
new file mode 100644
index 00000000..4a3ed639
--- /dev/null
+++ b/tokenizer/tokenize.c
@@ -0,0 +1,126 @@
+#include <assert.h>
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "tokenize.h"
+
+static inline uint16_t eq_mask(__m128i block, char c) {
+  return _mm_movemask_epi8(_mm_cmpeq_epi8(block, _mm_set1_epi8(c)));
+}
+
+static inline int is_token_char(char c) {
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
+         ('0' <= c && c <= '9') || c == '_' || c == '*';
+}
+
+int32_t asciionly(const char *text, size_t len) {
+  const char high = (char)0x80;
+  __m128i mask = _mm_set1_epi8(high);
+
+  size_t i;
+  int32_t result = 1;
+  for (i = 0; i + 16 <= len; i += 16) {
+    __m128i input = _mm_lddqu_si128((__m128i_u *)(text + i));
+    __m128i masked = _mm_and_si128(input, mask);
+
+    result &= (_mm_movemask_epi8(masked) == 0);
+    if (!result)
+      return 0;
+  }
+
+  for (; i < len; i++)
+    result &= ((text[i] & high) == 0);
+
+  return result;
+}
+
+int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi) {
+  __m128i block_lo = _mm_set1_epi8(lo);
+  __m128i block_zero = _mm_set1_epi8((char)0x0);
+  __m128i block_range = _mm_set1_epi8(hi - lo);
+
+  __m128i res = _mm_sub_epi8(block, block_lo);
+  res = _mm_subs_epu8(res, block_range);
+
+  return _mm_movemask_epi8(_mm_cmpeq_epi8(res, block_zero));
+}
+
+int32_t tokenize(const char *text, size_t len, span *out, int out_cap) {
+  if (!asciionly(text, len))
+    return -1;
+
+  int count = 0;
+  int token_start = -1;
+
+  size_t i;
+  for (i = 0; i + 16 <= len; i += 16) {
+    __m128i block = _mm_lddqu_si128((__m128i_u *)(text + i));
+
+    // Classify all 16 bytes at once: a set bit marks a token byte
+    // ([a-zA-Z0-9_*]); the complement marks delimiter bytes.
+    uint16_t bitmap_token =
+        boundaries(block, 'a', 'z') | boundaries(block, 'A', 'Z') |
+        boundaries(block, '0', '9') | eq_mask(block, '_') | eq_mask(block, '*');
+    uint16_t bitmap_delimiters = ~bitmap_token;
+
+    if (token_start == -1) {
+      // Whole block of 16 bytes contains no token symbols.
+      if (bitmap_token == 0)
+        continue;
+      token_start = (int)i + __builtin_ctz(bitmap_token);
+    }
+
+    // Whole block of 16 bytes contains token symbols only.
+    if (bitmap_delimiters == 0)
+      continue;
+
+    while (bitmap_delimiters && count < out_cap) {
+      int pos = (int)i + __builtin_ctz(bitmap_delimiters);
+
+      if (token_start != -1 && pos > token_start) {
+        out[count++] = (span){
+            .start = token_start,
+            .len = pos - token_start,
+        };
+        token_start = -1;
+      }
+
+      bitmap_delimiters &= bitmap_delimiters - 1;
+      if (token_start == -1 && bitmap_token) {
+        int bit = pos - (int)i;
+        uint16_t remaining = bitmap_token & ~((1u << (bit + 1)) - 1);
+        if (remaining)
+          token_start = (int)i + __builtin_ctz(remaining);
+      }
+    }
+  }
+
+  for (size_t j = i; j < len; j++) {
+    if (is_token_char(text[j])) {
+      if (token_start == -1)
+        token_start = (int)j;
+      continue;
+    }
+
+    if (token_start != -1 && count < out_cap) {
+      out[count++] = (span){
+          .start = token_start,
+          .len = j - token_start,
+      };
+      token_start = -1;
+    }
+  }
+
+  if (token_start != -1 && count < out_cap) {
+    out[count++] = (span){
+        .start = token_start,
+        .len = len - token_start,
+    };
+  }
+
+  return count;
+}
diff --git a/tokenizer/tokenize.h b/tokenizer/tokenize.h
new file mode 100644
index 00000000..6560f6f2
--- /dev/null
+++ b/tokenizer/tokenize.h
@@ -0,0 +1,20 @@
+#ifndef TOKENIZE_H
+#define TOKENIZE_H
+
+#include <emmintrin.h>
+#include <stddef.h>
+#include <stdint.h>
+
+// span describes one token as the byte range [start, start+len) of the input.
+// NOTE(review): len is uint16_t, so a token longer than 65535 bytes would be
+// truncated — confirm callers cap input/token length below that.
+typedef struct {
+  uint32_t start;
+  uint16_t len;
+} span;
+
+int32_t asciionly(const char *data, size_t len);
+int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi);
+int32_t tokenize(const char *text, size_t len, span *out, int out_cap);
+
+#endif // TOKENIZE_H