Skip to content

Commit

Permalink
Merge pull request #3544 from flairNLP/fix-t5-tokenizer
Browse files: browse the repository at this point in the history
fix T5 tokenizer loading
  • Loading branch information
alanakbik authored Oct 11, 2024
2 parents c674212 + 588279f commit 2993108
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions flair/embeddings/transformer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,6 +26,7 @@
LayoutLMv2FeatureExtractor,
PretrainedConfig,
PreTrainedTokenizer,
T5TokenizerFast,
)
from transformers.tokenization_utils_base import LARGE_INTEGER
from transformers.utils import PaddingStrategy
Expand Down Expand Up @@ -444,7 +445,7 @@ def _tokenizer_from_bytes(cls, zip_data: BytesIO) -> PreTrainedTokenizer:
zip_obj = zipfile.ZipFile(zip_data)
with tempfile.TemporaryDirectory() as temp_dir:
zip_obj.extractall(temp_dir)
return AutoTokenizer.from_pretrained(temp_dir, add_prefix_space=True)
return AutoTokenizer.from_pretrained(temp_dir)

@classmethod
def _feature_extractor_from_bytes(cls, zip_data: Optional[BytesIO]) -> Optional[FeatureExtractionMixin]:
Expand All @@ -458,7 +459,13 @@ def _feature_extractor_from_bytes(cls, zip_data: Optional[BytesIO]) -> Optional[
def __tokenizer_bytes(self):
with tempfile.TemporaryDirectory() as temp_dir:
files = list(self.tokenizer.save_pretrained(temp_dir))
if self.tokenizer.is_fast and self.tokenizer.slow_tokenizer_class:
if (
self.tokenizer.is_fast
and self.tokenizer.slow_tokenizer_class
and not isinstance(
self.tokenizer, T5TokenizerFast
) # do not remove slow files for T5, as it can only be created from slow tokenizer with prefix space
):
vocab_files = self.tokenizer.slow_tokenizer_class.vocab_files_names.values()
files = [f for f in files if all(v not in f for v in vocab_files)]
zip_data = BytesIO()
Expand Down

0 comments on commit 2993108

Please sign in to comment.