From fddcbef4a82148e8096632414e2ce56f854b0a48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonatan=20K=C5=82osko?=
Date: Mon, 22 Apr 2024 23:33:47 +0800
Subject: [PATCH] Use eos token for padding lazily

---
 lib/bumblebee/text/pre_trained_tokenizer.ex | 23 +++++++--------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/lib/bumblebee/text/pre_trained_tokenizer.ex b/lib/bumblebee/text/pre_trained_tokenizer.ex
index 66ce71fe..fe75fdda 100644
--- a/lib/bumblebee/text/pre_trained_tokenizer.ex
+++ b/lib/bumblebee/text/pre_trained_tokenizer.ex
@@ -131,10 +131,7 @@ defmodule Bumblebee.Text.PreTrainedTokenizer do
       special_tokens: %{
         unk: "<|endoftext|>",
         bos: "<|endoftext|>",
-        eos: "<|endoftext|>",
-        # CodeGen doesn't originally have a pad token, however when necessary
-        # we pad with the EOS token
-        pad: "<|endoftext|>"
+        eos: "<|endoftext|>"
       }
     },
     distilbert: %{
@@ -152,20 +149,14 @@ defmodule Bumblebee.Text.PreTrainedTokenizer do
       special_tokens: %{
         unk: "<|endoftext|>",
         bos: "<|endoftext|>",
-        eos: "<|endoftext|>",
-        # GPT-NeoX doesn't originally have a pad token, however when necessary
-        # we pad with the EOS token
-        pad: "<|endoftext|>"
+        eos: "<|endoftext|>"
       }
     },
     gpt2: %{
       special_tokens: %{
         unk: "<|endoftext|>",
         bos: "<|endoftext|>",
-        eos: "<|endoftext|>",
-        # GPT-2 doesn't originally have a pad token, however when necessary
-        # we pad with the EOS token
-        pad: "<|endoftext|>"
+        eos: "<|endoftext|>"
       }
     },
     layout_lm: %{
@@ -175,10 +166,7 @@ defmodule Bumblebee.Text.PreTrainedTokenizer do
       special_tokens: %{
         eos: "</s>",
         unk: "<unk>",
-        sep: "</s>",
-        # Llama doesn't originally have a pad token, however when necessary
-        # we pad with the EOS token
-        pad: "</s>"
+        sep: "</s>"
       }
     },
     mbart: %{
@@ -275,8 +263,11 @@ defmodule Bumblebee.Text.PreTrainedTokenizer do
   def apply(tokenizer, input) do
     input = List.wrap(input)
 
+    # Some tokenizers don't specify a PAD token, in which case we use
+    # the EOS token for padding by default
     pad_token =
       tokenizer.special_tokens[:pad] ||
+        tokenizer.special_tokens[:eos] ||
         raise ArgumentError,
               "expected the tokenizer to defined a padding token, but none was found"
 
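
Note: below is a minimal, runnable sketch of the pad-token fallback this patch introduces, for readers who want to see it in isolation. The PadFallback module and the literal token maps are illustrative stand-ins invented for this note, not part of Bumblebee; in the real code the lookup runs against tokenizer.special_tokens inside apply/2.

defmodule PadFallback do
  # Prefer an explicit :pad token; otherwise fall back to :eos,
  # mirroring the lookup chain the patch adds to apply/2.
  def pad_token(special_tokens) do
    special_tokens[:pad] ||
      special_tokens[:eos] ||
      raise(ArgumentError, "expected the tokenizer to define a padding token, but none was found")
  end
end

# GPT-2-style tokenizers define no :pad entry, so padding now falls
# back to the EOS token at lookup time:
PadFallback.pad_token(%{unk: "<|endoftext|>", bos: "<|endoftext|>", eos: "<|endoftext|>"})
#=> "<|endoftext|>"

# An explicit :pad token still takes precedence when present:
PadFallback.pad_token(%{pad: "<pad>", eos: "</s>"})
#=> "<pad>"

The benefit of resolving the fallback lazily is that the EOS-as-pad default lives in one place rather than being duplicated into every model's special token map, which is what allows the patch to delete the per-model pad: entries and their repeated comments.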