Skip to content

Commit

Permalink
chore: remove normalizer from training pipeline
Browse files — browse the repository at this point in the history
  • Loading branch information
ericrallen committed Feb 11, 2025
1 parent 8f02592 commit 75e25fd
Show file tree
Hide file tree
Showing 4 changed files with 3 additions and 8 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,4 @@ ignore = [
"E402", # false positives for local imports
"E501", # line too long
"TRY003", # external messages in exceptions are too verbose
-]
+]
3 changes: 0 additions & 3 deletions scripts/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from tokenizers import Regex, Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
-from tokenizers.normalizers import NFC
from tokenizers.pre_tokenizers import Split
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.trainers import BpeTrainer
Expand Down Expand Up @@ -68,8 +67,6 @@
),
)

-tokenizer.normalizer = NFC()

tokenizer.pre_tokenizer = Split(
pattern=Regex(TOKENIZER_CHUNK_PATTERN),
behavior="isolated",
Expand Down
4 changes: 1 addition & 3 deletions src/pgn_tokenizer/config/pgn-tokenizer.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@
"special": true
}
],
-"normalizer": {
-"type": "NFC"
-},
+"normalizer": null,
"pre_tokenizer": {
"type": "Split",
"pattern": {
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 75e25fd

Please sign in to comment.