Skip to content

Commit

Permalink
rebrand to tokenizers for consistency with upstream
Browse files Browse the repository at this point in the history
  • Loading branch information
Daulet Zhanguzin committed Mar 31, 2023
1 parent ab26de2 commit 7cf9d9d
Show file tree
Hide file tree
Showing 9 changed files with 17 additions and 17 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# build runs `cargo build --release` inside the Rust library directory, copies
# the resulting .a archive into lib/, and then runs `go build .` in the
# repository root.
# NOTE(review): this is a rendered diff hunk; the lib/tokenizer lines look like
# the removed side and the lib/tokenizers lines the added side of the rename.
# Confirm against the commit before treating both pairs as live recipe lines.
build:
@cd lib/tokenizer && cargo build --release
@cp lib/tokenizer/target/release/libtokenizer.a lib/
@cd lib/tokenizers && cargo build --release
@cp lib/tokenizers/target/release/libtokenizers.a lib/
@go build .

# test depends on build, then runs the Go tests of every package verbosely.
# -count=1 makes `go test` execute the tests instead of reusing cached results.
test: build
@go test -v ./... -count=1

# clean deletes the .a archive copied into lib/ and the cargo target directory.
# NOTE(review): this is a rendered diff hunk; the libtokenizer.a line looks like
# the removed side and the libtokenizers.a line the added side of the rename.
clean:
rm -rf lib/libtokenizer.a lib/tokenizer/target
rm -rf lib/libtokenizers.a lib/tokenizers/target
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module github.com/daulet/tokenizer
module github.com/daulet/tokenizers

go 1.18

Expand Down
2 changes: 1 addition & 1 deletion lib/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1 @@
libtokenizer.a
libtokenizers.a
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion lib/tokenizer/Cargo.toml → lib/tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[package]
name = "tokenizer"
name = "tokenizers"
version = "0.9.0"
edition = "2021"

Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions tokenizer.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
package tokenizer
package tokenizers

// TODO packaging: how do we build the Rust lib for distribution?

/*
#cgo LDFLAGS: ./lib/libtokenizer.a -ldl -lstdc++
#cgo LDFLAGS: ./lib/libtokenizers.a -ldl -lstdc++
#include <stdlib.h>
#include "./lib/tokenizer.h"
#include "./lib/tokenizers.h"
*/
import "C"

Expand Down
16 changes: 8 additions & 8 deletions tokenizer_test.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package tokenizer_test
package tokenizers_test

import (
"math/rand"
"testing"

"github.com/daulet/tokenizer"
"github.com/daulet/tokenizers"

"github.com/stretchr/testify/assert"
)
Expand All @@ -14,21 +14,21 @@ import (
const vocabSize = 30522

// TestEncode loads a tokenizer from the bert-base-uncased JSON fixture,
// encodes a fixed seven-word sentence, and asserts the exact token IDs returned.
// NOTE(review): the two consecutive FromFile lines appear to be the removed
// (package tokenizer) and added (package tokenizers) sides of this diff hunk;
// only one of them can be present in a compilable file — confirm against the
// commit.
func TestEncode(t *testing.T) {
tk := tokenizer.FromFile("./test/data/bert-base-uncased.json")
tk := tokenizers.FromFile("./test/data/bert-base-uncased.json")
// Close the tokenizer when the test returns.
defer tk.Close()
tokens := tk.Encode("brown fox jumps over the lazy dog")
// Seven IDs are expected for the seven-word input.
assert.Equal(t, []uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}, tokens)
}

// TestDecode loads a tokenizer from the bert-base-uncased JSON fixture,
// decodes a fixed slice of seven token IDs, and asserts the exact sentence
// returned. The IDs and sentence are the same pair that TestEncode checks in
// the opposite direction.
// NOTE(review): the two consecutive FromFile lines appear to be the removed
// (package tokenizer) and added (package tokenizers) sides of this diff hunk;
// only one of them can be present in a compilable file — confirm against the
// commit.
func TestDecode(t *testing.T) {
tk := tokenizer.FromFile("./test/data/bert-base-uncased.json")
tk := tokenizers.FromFile("./test/data/bert-base-uncased.json")
// Close the tokenizer when the test returns.
defer tk.Close()
str := tk.Decode([]uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899})
assert.Equal(t, "brown fox jumps over the lazy dog", str)
}

func BenchmarkEncodeNTimes(b *testing.B) {
tk := tokenizer.FromFile("./test/data/bert-base-uncased.json")
tk := tokenizers.FromFile("./test/data/bert-base-uncased.json")
defer tk.Close()
expected := []uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}
b.ResetTimer()
Expand All @@ -39,7 +39,7 @@ func BenchmarkEncodeNTimes(b *testing.B) {
}

func BenchmarkEncodeNChars(b *testing.B) {
tk := tokenizer.FromFile("./test/data/bert-base-uncased.json")
tk := tokenizers.FromFile("./test/data/bert-base-uncased.json")
defer tk.Close()
input := make([]rune, 0, b.N)
for i := 0; i < b.N; i++ {
Expand All @@ -52,7 +52,7 @@ func BenchmarkEncodeNChars(b *testing.B) {
}

func BenchmarkDecodeNTimes(b *testing.B) {
tk := tokenizer.FromFile("./test/data/bert-base-uncased.json")
tk := tokenizers.FromFile("./test/data/bert-base-uncased.json")
defer tk.Close()
b.ResetTimer()
for i := 0; i < b.N; i++ {
Expand All @@ -62,7 +62,7 @@ func BenchmarkDecodeNTimes(b *testing.B) {
}

func BenchmarkDecodeNTokens(b *testing.B) {
tk := tokenizer.FromFile("./test/data/bert-base-uncased.json")
tk := tokenizers.FromFile("./test/data/bert-base-uncased.json")
defer tk.Close()
input := make([]uint32, 0, b.N)
for i := 0; i < b.N; i++ {
Expand Down

0 comments on commit 7cf9d9d

Please sign in to comment.