
Commit

Fix output attention bug. Update dependencies.
Riccorl committed Dec 11, 2021
1 parent 491839d commit 4a8630a
Showing 5 changed files with 17 additions and 31 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@

[![Open in Visual Studio Code](https://open.vscode.dev/badges/open-in-vscode.svg)](https://github.dev/Riccorl/transformers-embedder)
[![PyTorch](https://img.shields.io/badge/PyTorch-orange?logo=pytorch)](https://pytorch.org/)
-[![Transformers](https://img.shields.io/badge/4.12-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/)
+[![Transformers](https://img.shields.io/badge/4.13-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000)](https://github.com/psf/black)

[![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml)
2 changes: 1 addition & 1 deletion setup.py
@@ -12,7 +12,7 @@

setuptools.setup(
name="transformers_embedder", # Replace with your own username
version="1.8.2",
version="1.8.3",
author="Riccardo Orlando",
author_email="orlandoricc@gmail.com",
description="Word level transformer based embeddings",
6 changes: 3 additions & 3 deletions transformers_embedder/embedder.py
@@ -55,7 +55,7 @@ def __init__(
) -> None:
super().__init__()
if isinstance(model, str):
-config = tr.AutoConfig.from_pretrained(model, output_hidden_states=True, output_attention=True)
+config = tr.AutoConfig.from_pretrained(model, output_hidden_states=True, output_attentions=True)
self.transformer_model = tr.AutoModel.from_pretrained(model, config=config)
else:
self.transformer_model = model
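
The fix above renames the config kwarg from `output_attention` to `output_attentions`, the spelling 🤗 Transformers actually recognizes, so the wrapped model can return attention maps. A minimal sketch of the corrected call outside this library (the model name and input sentence are arbitrary examples, not taken from this repository):

import transformers as tr

# With the correctly spelled kwarg, the config enables attention outputs.
config = tr.AutoConfig.from_pretrained(
    "bert-base-cased", output_hidden_states=True, output_attentions=True
)
model = tr.AutoModel.from_pretrained("bert-base-cased", config=config)
tokenizer = tr.AutoTokenizer.from_pretrained("bert-base-cased")

inputs = tokenizer("A short example sentence.", return_tensors="pt")
outputs = model(**inputs)
# One attention tensor per layer, shape (batch, heads, seq_len, seq_len).
print(len(outputs.attentions), outputs.attentions[0].shape)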
@@ -194,7 +194,7 @@ def scatter_sum(

def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:
"""
-Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
+Resizes input token embeddings' matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
Args:
new_num_tokens (:obj:`int`):
@@ -223,7 +223,7 @@ def save_pretrained(self, save_directory: Union[str, Path]):
def broadcast(src: torch.Tensor, other: torch.Tensor):
"""
Minimal version of ``broadcast``, from `pytorch_scatter <https://github.com/rusty1s/pytorch_scatter/>`_
-library, that is compatible for ONNX but works only for our case.
+library, that is compatible with ONNX but works only for our case.
Args:
src (:obj:`torch.Tensor`):
37 changes: 12 additions & 25 deletions transformers_embedder/tokenizer.py
@@ -31,7 +31,7 @@ class Tokenizer:
Language model name (or a transformer :obj:`PreTrainedTokenizer`.
"""

-def __init__(self, model: Union[str, tr.PreTrainedTokenizer]):
+def __init__(self, model: Union[str, tr.PreTrainedTokenizer], language: str = "xx_sent_ud_sm"):
if isinstance(model, str):
# init HuggingFace tokenizer
self.huggingface_tokenizer = tr.AutoTokenizer.from_pretrained(model)
@@ -41,7 +41,10 @@ def __init__(self, model: Union[str, tr.PreTrainedTokenizer]):
self.huggingface_tokenizer = model
self.config = tr.AutoConfig.from_pretrained(self.huggingface_tokenizer.name_or_path)
# simple tokenizer used if the input is `str`
+# lazy load, None at first
self.spacy_tokenizer = None
+# default multilingual model
+self.language = language
# padding stuff
# default, batch length is model max length
self.subtoken_max_batch_len = self.huggingface_tokenizer.model_max_length
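
The added lines introduce a `language` argument (defaulting to spaCy's multilingual `xx_sent_ud_sm` pipeline) and leave `spacy_tokenizer` unset until it is actually needed. A rough sketch of that lazy-loading pattern, using an illustrative stand-in class rather than the repository's `Tokenizer`:

import spacy

class LazySpacyTokenizer:
    """Illustrative stand-in, not the class defined in this repository."""

    def __init__(self, language: str = "xx_sent_ud_sm"):
        self.language = language
        self.spacy_tokenizer = None  # lazy load, None at first

    def pretokenize(self, text: str):
        # The spaCy pipeline is loaded only on the first call that needs it.
        if self.spacy_tokenizer is None:
            self.spacy_tokenizer = spacy.load(self.language)
        return [token.text for token in self.spacy_tokenizer(text)]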
@@ -116,9 +119,7 @@ def __call__(
# self._type_checking(text, text_pair)

# check if input is batched or a single sample
-is_batched = bool(
-isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
-)
+is_batched = bool(isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))

# if text is str or a list of str and they are not split, then text needs to be tokenized
if isinstance(text, str) or (not is_split_into_words and isinstance(text[0], str)):
@@ -128,9 +129,7 @@
else:
text = [self.pretokenize(t, use_spacy=use_spacy) for t in text]
text_pair = (
-[self.pretokenize(t, use_spacy=use_spacy) for t in text_pair]
-if text_pair
-else None
+[self.pretokenize(t, use_spacy=use_spacy) for t in text_pair] if text_pair else None
)

# get model max length if not specified by user
@@ -297,9 +296,7 @@ def _build_tokens(
offsets.append(len(words) - 1) # -1 because we want the last index
return words, input_ids, token_type_ids, offsets

-def pad_batch(
-self, batch: Union[ModelInputs, Dict[str, list]], max_length: int = None
-) -> ModelInputs:
+def pad_batch(self, batch: Union[ModelInputs, Dict[str, list]], max_length: int = None) -> ModelInputs:
"""
Pad the batch to its maximum length or to the specified :obj:`max_length`.
@@ -366,9 +363,7 @@ def pad_sequence(
padding = [value] * abs(length - len(sequence))
if isinstance(sequence, torch.Tensor):
if len(sequence.shape) > 1:
-raise ValueError(
-f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`"
-)
+raise ValueError(f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`")
padding = torch.as_tensor(padding)
if pad_to_left:
if isinstance(sequence, torch.Tensor):
@@ -470,9 +465,7 @@ def to_tensor(self, batch: Union[ModelInputs, List[dict], dict]) -> ModelInputs:
"""
# convert to tensor
batch = {
-k: torch.as_tensor(v)
-if k in self.to_tensor_inputs and not isinstance(v, torch.Tensor)
-else v
+k: torch.as_tensor(v) if k in self.to_tensor_inputs and not isinstance(v, torch.Tensor) else v
for k, v in batch.items()
}
return ModelInputs(batch)
@@ -567,10 +560,7 @@ def is_type_correct(text_to_check: Any) -> bool:
isinstance(text_to_check[0], str)
or (
isinstance(text_to_check[0], (list, tuple))
-and (
-len(text_to_check[0]) == 0
-or isinstance(text_to_check[0][0], str)
-)
+and (len(text_to_check[0]) == 0 or isinstance(text_to_check[0][0], str))
)
)
)
@@ -739,11 +729,8 @@ def to(self, device: Union[str, torch.device]) -> ModelInputs:
"""
if isinstance(device, (str, torch.device, int)):
self.data = {
-k: v.to(device=device) if isinstance(v, torch.Tensor) else v
-for k, v in self.data.items()
+k: v.to(device=device) if isinstance(v, torch.Tensor) else v for k, v in self.data.items()
}
else:
-logger.warning(
-f"Attempting to cast to another type, {str(device)}. This is not supported."
-)
+logger.warning(f"Attempting to cast to another type, {str(device)}. This is not supported.")
return self
1 change: 0 additions & 1 deletion transformers_embedder/utils.py
@@ -1,6 +1,5 @@
import importlib.util
import logging
-from typing import Optional, Tuple, Union, Any

_torch_available = importlib.util.find_spec("torch") is not None
_spacy_available = importlib.util.find_spec("spacy") is not None
