64 changes: 64 additions & 0 deletions tokenizer/text_tokenizer.go
@@ -1,12 +1,27 @@
package tokenizer

/*
#cgo CFLAGS: -O3 -msse3 -g -Wall -Wextra
#include "tokenize.h"
*/
import "C"

import (
"sync"
"unicode"
"unicode/utf8"
"unsafe"

"github.com/ozontech/seq-db/metric"
)

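// spanBufPool recycles span buffers across Tokenize calls so the cgo fast
// path does not allocate per call. Pointers to slices are pooled so that
// sync.Pool.Put itself does not allocate.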
var spanBufPool = sync.Pool{
New: func() any {
buf := make([]C.span, 64)
return &buf
},
}

type TextTokenizer struct {
maxTokenSize int
caseSensitive bool
@@ -47,7 +62,23 @@
value = value[:maxLength]
k := 0

	bufp := spanBufPool.Get().(*[]C.span)
	spans, ok := tokenize(value, *bufp)

	if ok {
		for _, s := range spans {
			start, length := uint32(s.start), uint32(s.len)
			token := value[start : start+length]
			tokens = append(tokens, MetaToken{Key: name, Value: token})
		}
		*bufp = spans[:cap(spans)]
		spanBufPool.Put(bufp)
		return tokens
	}

	// The SIMD fast path only handles ASCII input: return the buffer to the
	// pool and fall back to the scalar Unicode-aware tokenizer below.
	spanBufPool.Put(bufp)

	hasUpper := false
	asciiOnly := true
// Loop over the string looking for tokens.
// Token of TextTokenizer is a string that contains only letters, numbers, '*' or '_'.
@@ -103,3 +134,36 @@

return tokens
}

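// asciiOnly reports whether s contains only ASCII bytes. Safe for empty
// input: the C side never dereferences the pointer when len is zero.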
func asciiOnly(s []byte) bool {
return int32(C.asciionly(
(*C.char)(unsafe.Pointer(unsafe.SliceData(s))),
C.size_t(len(s)),
)) == 1
}

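// tokenize wraps the C tokenizer. It reports ok == false when text contains
// non-ASCII bytes, in which case the caller must take the scalar fallback.
// The returned spans index into text; buf is grown if its capacity is below
// the worst-case span count.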
func tokenize(text []byte, buf []C.span) ([]C.span, bool) {
if len(text) == 0 {
return buf[:0], true
}

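	// Every token needs at least one byte plus a delimiter, so len(text)
	// bytes hold at most len(text)/2+1 tokens.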
required := len(text)/2 + 1
if cap(buf) < required {
buf = make([]C.span, required)
} else {
buf = buf[:required]
}

n := C.tokenize(
(*C.char)(unsafe.Pointer(&text[0])),
C.size_t(len(text)),
&buf[0],
		C.int(required),
)

if n < 0 {
return nil, false
}

return buf[:n], true
}
26 changes: 26 additions & 0 deletions tokenizer/text_tokenizer_test.go
@@ -14,7 +14,7 @@

func TestTokenizeEmptyValue(t *testing.T) {
testCase := []byte("")
tokenizer := NewTextTokenizer(1000, false, true, 1024)

tokens := tokenizer.Tokenize([]MetaToken{}, []byte("message"), testCase, maxTokenSizeDummy)
expected := []MetaToken{newMetaToken("message", "")}
@@ -24,7 +24,7 @@

func TestTokenizeSimple(t *testing.T) {
testCase := []byte("arr hello world")
tokenizer := NewTextTokenizer(1000, false, true, 1024)

tokens := tokenizer.Tokenize(nil, []byte("message"), testCase, maxTokenSizeDummy)
assert.Equal(t, newMetaToken("message", "arr"), tokens[0])
@@ -33,7 +33,7 @@
}

func TestTokenizeSimple2(t *testing.T) {
tokenizer := NewTextTokenizer(1000, false, true, 1024)

tokens := tokenizer.Tokenize(nil, []byte("message"), bytes.Clone(longDocument), maxTokenSizeDummy)

assert.Equal(t, newMetaToken("message", "t1"), tokens[0])
@@ -48,7 +48,7 @@

func TestTokenizePartialDefault(t *testing.T) {
const maxSize = 100500
tokenizer := NewTextTokenizer(maxSize, false, true, maxSize)

testCase := []byte(strings.Repeat("1", maxSize+1))

tokens := tokenizer.Tokenize([]MetaToken{}, []byte("message"), testCase, maxTokenSizeDummy)
@@ -60,7 +60,7 @@

func TestTokenizePartial(t *testing.T) {
const maxSize = 100500
tokenizer := NewTextTokenizer(maxSize, false, true, 0)

testCase := []byte(strings.Repeat("1", maxSize+1))

tokens := tokenizer.Tokenize(nil, []byte("message"), testCase, maxSize)
@@ -72,7 +72,7 @@

func TestTokenizePartialSkipDefault(t *testing.T) {
const maxSize = 100500
tokenizer := NewTextTokenizer(maxSize, false, false, maxSize)

testCase := []byte(strings.Repeat("1", maxSize+1))

tokens := tokenizer.Tokenize([]MetaToken{}, []byte("message"), testCase, maxTokenSizeDummy)
@@ -187,3 +187,29 @@
test("пРивеt世界", []string{"пРивеt世界"})
test("А", []string{"А"})
}
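
// A differential check between the SIMD fast path and a plain Go model makes
// regressions easy to catch. This is a hedged sketch, not part of the PR: it
// assumes Tokenize emits one MetaToken per token for non-empty ASCII input
// (as the tests above suggest); the test name and corpus are illustrative,
// and the inputs are kept lowercase to sidestep any case folding.
func TestTokenizeMatchesGoModel(t *testing.T) {
	isTok := func(c byte) bool {
		return c == '_' || c == '*' ||
			('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
			('0' <= c && c <= '9')
	}
	tokenizer := NewTextTokenizer(1000, false, true, 1024)
	for _, input := range []string{
		"a",
		"get /api/v1/users 200 ok",
		strings.Repeat("ab_cd ", 50),
	} {
		// Re-derive the expected tokens with a scalar scan.
		var want []string
		start := -1
		for i := 0; i <= len(input); i++ {
			if i < len(input) && isTok(input[i]) {
				if start == -1 {
					start = i
				}
				continue
			}
			if start != -1 {
				want = append(want, input[start:i])
				start = -1
			}
		}

		got := tokenizer.Tokenize(nil, []byte("message"), []byte(input), maxTokenSizeDummy)
		vals := make([]string, 0, len(got))
		for _, tok := range got {
			vals = append(vals, string(tok.Value))
		}
		assert.Equal(t, want, vals, "input %q", input)
	}
}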

func BenchmarkTokenize(b *testing.B) {
tokenizer := NewTextTokenizer(1000, false, true, 1024)
name := []byte("message")

short := []byte("GET /api/v1/users 200 OK")
medium := []byte("2025-02-27T10:15:30Z INFO worker_3 processed request from 192.168.1.42 method=POST path=/api/v1/orders status=201 latency_ms=12 bytes=4096")
long := bytes.Repeat([]byte("connection_timeout from host=server42 region=eu_west error_code=ETIMEDOUT retry_count=3 "), 10)

for _, tc := range []struct {
name string
data []byte
}{
{"short_24B", short},
{"medium_150B", medium},
{"long_900B", long},
} {
b.Run(tc.name, func(b *testing.B) {
b.SetBytes(int64(len(tc.data)))
var tokens []MetaToken
for b.Loop() {
tokens = tokenizer.Tokenize(tokens[:0], name, tc.data, 0)
}
})
}
}
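
To reproduce the benchmark locally (testing.B.Loop needs Go 1.24 or newer, and the cgo flags above assume an SSE3-capable x86-64 host):

go test -bench BenchmarkTokenize -benchmem ./tokenizer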
126 changes: 126 additions & 0 deletions tokenizer/tokenize.c
@@ -0,0 +1,126 @@
#include "assert.h"
#include "emmintrin.h"
#include "immintrin.h"
#include "stdint.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"

#include "tokenize.h"

static inline uint16_t eq_mask(__m128i block, char c) {
return _mm_movemask_epi8(_mm_cmpeq_epi8(block, _mm_set1_epi8(c)));
}

static inline int is_token_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
('0' <= c && c <= '9') || c == '_' || c == '*';
}

int32_t asciionly(const char *text, size_t len) {
  const char high = (char)0x80;
  __m128i mask = _mm_set1_epi8(high);

  size_t i;
  int32_t result = 1;
  for (i = 0; i + 16 <= len; i += 16) {
    __m128i input = _mm_lddqu_si128((const __m128i_u *)(text + i));
    __m128i masked = _mm_and_si128(input, mask);

    if (_mm_movemask_epi8(masked) != 0)
      return 0;
  }

for (; i < len; i++)
result &= ((text[i] & high) == 0);

return result;
}

uint16_t boundaries(__m128i block, uint8_t lo, uint8_t hi) {
  __m128i block_lo = _mm_set1_epi8((char)lo);
  __m128i block_zero = _mm_setzero_si128();
  __m128i block_range = _mm_set1_epi8((char)(hi - lo));

  // The wrapping subtract maps bytes in [lo, hi] to [0, hi - lo]; the
  // unsigned saturating subtract then zeroes exactly those lanes.
  __m128i res = _mm_sub_epi8(block, block_lo);
  res = _mm_subs_epu8(res, block_range);

  return (uint16_t)_mm_movemask_epi8(_mm_cmpeq_epi8(res, block_zero));
}
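
/* Scalar model of boundaries() for a single byte (illustrative sketch, not
 * part of the PR): the wrapping subtract maps bytes in [lo, hi] to 0..hi-lo
 * and all other bytes to something larger, so the saturating subtract lands
 * on zero exactly for in-range bytes. */
static inline int in_range_scalar(uint8_t c, uint8_t lo, uint8_t hi) {
  uint8_t shifted = (uint8_t)(c - lo);  /* wraps below lo, like _mm_sub_epi8 */
  return shifted <= (uint8_t)(hi - lo); /* zero after _mm_subs_epu8 */
}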

int32_t tokenize(const char *text, size_t len, span *out, int out_cap) {
if (!asciionly(text, len))
return -1;

int count = 0;
int token_start = -1;

size_t i;
for (i = 0; i + 16 <= len; i += 16) {
__m128i block = _mm_lddqu_si128((__m128i_u *)(text + i));

    // TODO: evaluate a PSHUFB-based classifier; the five range/equality
    // checks per block may carry avoidable overhead.
    uint16_t bitmap_token =
        boundaries(block, 'a', 'z') | boundaries(block, 'A', 'Z') |
        boundaries(block, '0', '9') | eq_mask(block, '_') | eq_mask(block, '*');
    uint16_t bitmap_delimiters = ~bitmap_token;

    if (token_start == -1) {
      // No token characters anywhere in this 16-byte block.
      if (bitmap_token == 0)
        continue;
      token_start = i + __builtin_ctz(bitmap_token);
    }

    // The whole 16-byte block is token characters: the current token
    // continues into the next block.
    if (bitmap_delimiters == 0)
      continue;

    while (bitmap_delimiters && count < out_cap) {
      int pos = i + __builtin_ctz(bitmap_delimiters);

      if (token_start != -1 && pos > token_start) {
        out[count++] = (span){
            .start = token_start,
            .len = pos - token_start,
        };
        token_start = -1;
      }

      bitmap_delimiters &= bitmap_delimiters - 1;
      if (token_start == -1 && bitmap_token) {
        int bit = pos - i;
        uint16_t remaining = bitmap_token & ~((1 << (bit + 1)) - 1);
        if (remaining)
          token_start = i + __builtin_ctz(remaining);
      }
    }
}

for (size_t j = i; j < len; j++) {
if (is_token_char(text[j])) {
if (token_start == -1)
token_start = j;
continue;
}

if (token_start != -1 && count < out_cap) {
out[count++] = (span){
.start = token_start,
.len = j - token_start,
};
token_start = -1;
}
}

if (token_start != -1 && count < out_cap) {
out[count++] = (span){
.start = token_start,
.len = len - token_start,
};
}

return count;
}
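
/* Differential-testing sketch (illustrative, not part of the PR): a scalar
 * tokenizer with the same contract as tokenize(), useful for fuzzing the
 * SIMD path against a straightforward reference. */
static inline int32_t tokenize_scalar(const char *text, size_t len, span *out,
                                      int out_cap) {
  if (!asciionly(text, len))
    return -1;

  int count = 0;
  int token_start = -1;
  for (size_t j = 0; j < len; j++) {
    if (is_token_char(text[j])) {
      if (token_start == -1)
        token_start = (int)j;
      continue;
    }
    if (token_start != -1 && count < out_cap) {
      out[count++] = (span){.start = (uint32_t)token_start,
                            .len = (uint16_t)(j - token_start)};
      token_start = -1;
    }
  }

  if (token_start != -1 && count < out_cap)
    out[count++] = (span){.start = (uint32_t)token_start,
                          .len = (uint16_t)(len - token_start)};

  return count;
}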
16 changes: 16 additions & 0 deletions tokenizer/tokenize.h
@@ -0,0 +1,16 @@
#include "assert.h"
#include "emmintrin.h"
#include "immintrin.h"
#include "stdint.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"

typedef struct {
uint32_t start;
uint16_t len;
} span;

int32_t asciionly(const char *data, size_t len);
int16_t boundaries(__m128i block, uint8_t lo, uint8_t hi);
int32_t tokenize(const char *text, size_t len, span *out, int out_cap);