train_tokenizer_es.py
# Train a Unigram tokenizer on the Spanish side of an English-Spanish
# parallel corpus and save it for the NMT pipeline.
import os

from datasets import load_dataset
from tokenizers import (Regex, Tokenizer, decoders, models, normalizers,
                        pre_tokenizers, trainers)
# Spanish target side of the Europarl English-Spanish parallel corpus.
dataset = load_dataset("avacaondata/europarl_en_es_v2", split='train')
# Smaller alternative corpus:
# dataset = load_dataset("opus_books", "en-es", split='train')
def get_source_corpus():
    # Yield the Spanish sentences in batches of 1000 so the trainer can
    # stream the corpus instead of holding it all in memory.
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["target_es"]
# Build a SentencePiece-style tokenizer around a Unigram model.
tokenizer = Tokenizer(models.Unigram())
# Normalization: standardize TeX-style quotes, apply Unicode NFKD, strip
# accents (note this also drops Spanish diacritics, e.g. "más" -> "mas"),
# and collapse runs of spaces.
tokenizer.normalizer = normalizers.Sequence(
[
normalizers.Replace("``", '"'),
normalizers.Replace("''", '"'),
normalizers.NFKD(),
normalizers.StripAccents(),
normalizers.Replace(Regex(" {2,}"), " "),
]
)
# Metaspace replaces spaces with "▁" so word boundaries survive tokenization.
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
# Sanity check: show how a sample sentence is split before the model runs.
print(tokenizer.pre_tokenizer.pre_tokenize_str("Let's test the pre-tokenizer!"))
# Reserve special tokens so they get fixed ids ahead of the learned vocabulary.
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
trainer = trainers.UnigramTrainer(
vocab_size=25000, special_tokens=special_tokens, unk_token="<unk>"
)
# Train on the streamed Spanish corpus.
tokenizer.train_from_iterator(get_source_corpus(), trainer=trainer)
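
# Optional addition (not in the original script): pair the Metaspace
# pre-tokenizer with a Metaspace decoder so decoded ids round-trip back
# to normally spaced text.
tokenizer.decoder = decoders.Metaspace()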
# Persist the trained tokenizer.
os.makedirs('vanilla-NMT/es', exist_ok=True)
tokenizer.save('vanilla-NMT/es/tokenizer.json')
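
# Usage sketch: reload the saved tokenizer and encode a sample sentence.
# (The sample text and prints are illustrative, not part of the original script.)
loaded = Tokenizer.from_file('vanilla-NMT/es/tokenizer.json')
encoding = loaded.encode("Hola, esto es una prueba.")
print(encoding.tokens)  # learned subword pieces, "▁"-prefixed at word starts
print(encoding.ids)     # corresponding vocabulary ids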