diff --git a/README.md b/README.md
index a3a6e28..9b3d688 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 [![Open in Visual Studio Code](https://open.vscode.dev/badges/open-in-vscode.svg)](https://github.dev/Riccorl/transformers-embedder)
 [![PyTorch](https://img.shields.io/badge/PyTorch-orange?logo=pytorch)](https://pytorch.org/)
-[![Transformers](https://img.shields.io/badge/4.12-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/)
+[![Transformers](https://img.shields.io/badge/4.13-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000)](https://github.com/psf/black)
 [![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml)
 
diff --git a/setup.py b/setup.py
index 9795330..20db2e6 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
 
 setuptools.setup(
     name="transformers_embedder",  # Replace with your own username
-    version="1.8.2",
+    version="1.8.3",
     author="Riccardo Orlando",
     author_email="orlandoricc@gmail.com",
     description="Word level transformer based embeddings",
diff --git a/transformers_embedder/embedder.py b/transformers_embedder/embedder.py
index 51c79b3..b55e116 100644
--- a/transformers_embedder/embedder.py
+++ b/transformers_embedder/embedder.py
@@ -55,7 +55,7 @@ def __init__(
     ) -> None:
         super().__init__()
         if isinstance(model, str):
-            config = tr.AutoConfig.from_pretrained(model, output_hidden_states=True, output_attention=True)
+            config = tr.AutoConfig.from_pretrained(model, output_hidden_states=True, output_attentions=True)
             self.transformer_model = tr.AutoModel.from_pretrained(model, config=config)
         else:
             self.transformer_model = model
@@ -194,7 +194,7 @@ def scatter_sum(
 
     def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:
         """
-        Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
+        Resizes input token embeddings' matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
 
         Args:
             new_num_tokens (:obj:`int`):
@@ -223,7 +223,7 @@ def save_pretrained(self, save_directory: Union[str, Path]):
 def broadcast(src: torch.Tensor, other: torch.Tensor):
     """
     Minimal version of ``broadcast``, from `pytorch_scatter `_
-    library, that is compatible for ONNX but works only for our case.
+    library, that is compatible with ONNX but works only for our case.
 
     Args:
         src (:obj:`torch.Tensor`):
diff --git a/transformers_embedder/tokenizer.py b/transformers_embedder/tokenizer.py
index 107a5b0..48f1c21 100644
--- a/transformers_embedder/tokenizer.py
+++ b/transformers_embedder/tokenizer.py
@@ -31,7 +31,7 @@ class Tokenizer:
             Language model name (or a transformer :obj:`PreTrainedTokenizer`.
     """
 
-    def __init__(self, model: Union[str, tr.PreTrainedTokenizer]):
+    def __init__(self, model: Union[str, tr.PreTrainedTokenizer], language: str = "xx_sent_ud_sm"):
         if isinstance(model, str):
             # init HuggingFace tokenizer
             self.huggingface_tokenizer = tr.AutoTokenizer.from_pretrained(model)
@@ -41,7 +41,10 @@ def __init__(self, model: Union[str, tr.PreTrainedTokenizer]):
             self.huggingface_tokenizer = model
         self.config = tr.AutoConfig.from_pretrained(self.huggingface_tokenizer.name_or_path)
         # simple tokenizer used if the input is `str`
+        # lazy load, None at first
         self.spacy_tokenizer = None
+        # default multilingual model
+        self.language = language
         # padding stuff
         # default, batch length is model max length
         self.subtoken_max_batch_len = self.huggingface_tokenizer.model_max_length
@@ -116,9 +119,7 @@ def __call__(
         # self._type_checking(text, text_pair)
 
         # check if input is batched or a single sample
-        is_batched = bool(
-            isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
-        )
+        is_batched = bool(isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))
 
         # if text is str or a list of str and they are not split, then text needs to be tokenized
         if isinstance(text, str) or (not is_split_into_words and isinstance(text[0], str)):
@@ -128,9 +129,7 @@ def __call__(
         else:
             text = [self.pretokenize(t, use_spacy=use_spacy) for t in text]
             text_pair = (
-                [self.pretokenize(t, use_spacy=use_spacy) for t in text_pair]
-                if text_pair
-                else None
+                [self.pretokenize(t, use_spacy=use_spacy) for t in text_pair] if text_pair else None
             )
 
         # get model max length if not specified by user
@@ -297,9 +296,7 @@ def _build_tokens(
             offsets.append(len(words) - 1)  # -1 because we want the last index
         return words, input_ids, token_type_ids, offsets
 
-    def pad_batch(
-        self, batch: Union[ModelInputs, Dict[str, list]], max_length: int = None
-    ) -> ModelInputs:
+    def pad_batch(self, batch: Union[ModelInputs, Dict[str, list]], max_length: int = None) -> ModelInputs:
         """
         Pad the batch to its maximum length or to the specified :obj:`max_length`.
 
@@ -366,9 +363,7 @@ def pad_sequence(
             padding = [value] * abs(length - len(sequence))
         if isinstance(sequence, torch.Tensor):
             if len(sequence.shape) > 1:
-                raise ValueError(
-                    f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`"
-                )
+                raise ValueError(f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`")
             padding = torch.as_tensor(padding)
         if pad_to_left:
             if isinstance(sequence, torch.Tensor):
@@ -470,9 +465,7 @@ def to_tensor(self, batch: Union[ModelInputs, List[dict], dict]) -> ModelInputs:
         """
         # convert to tensor
         batch = {
-            k: torch.as_tensor(v)
-            if k in self.to_tensor_inputs and not isinstance(v, torch.Tensor)
-            else v
+            k: torch.as_tensor(v) if k in self.to_tensor_inputs and not isinstance(v, torch.Tensor) else v
             for k, v in batch.items()
         }
         return ModelInputs(batch)
@@ -567,10 +560,7 @@ def is_type_correct(text_to_check: Any) -> bool:
                 isinstance(text_to_check[0], str)
                 or (
                     isinstance(text_to_check[0], (list, tuple))
-                    and (
-                        len(text_to_check[0]) == 0
-                        or isinstance(text_to_check[0][0], str)
-                    )
+                    and (len(text_to_check[0]) == 0 or isinstance(text_to_check[0][0], str))
                 )
             )
         )
@@ -739,11 +729,8 @@ def to(self, device: Union[str, torch.device]) -> ModelInputs:
         """
         if isinstance(device, (str, torch.device, int)):
             self.data = {
-                k: v.to(device=device) if isinstance(v, torch.Tensor) else v
-                for k, v in self.data.items()
+                k: v.to(device=device) if isinstance(v, torch.Tensor) else v for k, v in self.data.items()
             }
         else:
-            logger.warning(
-                f"Attempting to cast to another type, {str(device)}. This is not supported."
-            )
+            logger.warning(f"Attempting to cast to another type, {str(device)}. This is not supported.")
         return self
diff --git a/transformers_embedder/utils.py b/transformers_embedder/utils.py
index 380ff16..a02dff6 100644
--- a/transformers_embedder/utils.py
+++ b/transformers_embedder/utils.py
@@ -1,6 +1,5 @@
 import importlib.util
 import logging
-from typing import Optional, Tuple, Union, Any
 
 _torch_available = importlib.util.find_spec("torch") is not None
 _spacy_available = importlib.util.find_spec("spacy") is not None
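Note (reviewer aid, not part of the patch): a minimal usage sketch of the API surface touched above. The `Tokenizer(model, language=...)` signature, the `use_spacy` flag, `to_tensor()`, and `ModelInputs.to()` are taken from the hunks; the HuggingFace model name and the sample sentence are illustrative assumptions. The embedder.py hunk only corrects the config keyword to `output_attentions`, so no call-site changes are expected.

    from transformers_embedder.tokenizer import Tokenizer

    # `language` is the keyword introduced by this patch; it names the spaCy model that
    # backs the lazily created `spacy_tokenizer` (default: multilingual "xx_sent_ud_sm").
    tokenizer = Tokenizer("bert-base-cased", language="xx_sent_ud_sm")

    # Plain strings are pre-tokenized (with spaCy when `use_spacy=True`) and returned as a
    # ModelInputs mapping, which can then be converted to tensors and moved to a device.
    inputs = tokenizer("Transformers Embedder maps subwords back to words.", use_spacy=True)
    inputs = tokenizer.to_tensor(inputs).to("cpu")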