
Commit

Fix output attention bug. Update dependencies.
Riccorl committed Dec 11, 2021
1 parent 491839d commit 4a8630a
Showing 5 changed files with 17 additions and 31 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@

[![Open in Visual Studio Code](https://open.vscode.dev/badges/open-in-vscode.svg)](https://github.dev/Riccorl/transformers-embedder)
[![PyTorch](https://img.shields.io/badge/PyTorch-orange?logo=pytorch)](https://pytorch.org/)
-[![Transformers](https://img.shields.io/badge/4.12-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/)
+[![Transformers](https://img.shields.io/badge/4.13-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000)](https://github.com/psf/black)

[![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml)
2 changes: 1 addition & 1 deletion setup.py
@@ -12,7 +12,7 @@

setuptools.setup(
name="transformers_embedder", # Replace with your own username
version="1.8.2",
version="1.8.3",
author="Riccardo Orlando",
author_email="orlandoricc@gmail.com",
description="Word level transformer based embeddings",
6 changes: 3 additions & 3 deletions transformers_embedder/embedder.py
@@ -55,7 +55,7 @@ def __init__(
) -> None:
super().__init__()
if isinstance(model, str):
-config = tr.AutoConfig.from_pretrained(model, output_hidden_states=True, output_attention=True)
+config = tr.AutoConfig.from_pretrained(model, output_hidden_states=True, output_attentions=True)
self.transformer_model = tr.AutoModel.from_pretrained(model, config=config)
else:
self.transformer_model = model
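
The fix above renames the config kwarg from `output_attention` to `output_attentions`, the spelling 🤗 Transformers actually recognizes, so the wrapped model can return attention maps. A minimal sketch of the corrected call outside this library (the model name and input sentence are arbitrary examples, not taken from this repository):

import transformers as tr

# With the correctly spelled kwarg, the config enables attention outputs.
config = tr.AutoConfig.from_pretrained(
    "bert-base-cased", output_hidden_states=True, output_attentions=True
)
model = tr.AutoModel.from_pretrained("bert-base-cased", config=config)
tokenizer = tr.AutoTokenizer.from_pretrained("bert-base-cased")

inputs = tokenizer("A short example sentence.", return_tensors="pt")
outputs = model(**inputs)
# One attention tensor per layer, shape (batch, heads, seq_len, seq_len).
print(len(outputs.attentions), outputs.attentions[0].shape)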
@@ -194,7 +194,7 @@ def scatter_sum(

def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:
"""
-Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
+Resizes input token embeddings' matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
Args:
new_num_tokens (:obj:`int`):
@@ -223,7 +223,7 @@ def save_pretrained(self, save_directory: Union[str, Path]):
def broadcast(src: torch.Tensor, other: torch.Tensor):
"""
Minimal version of ``broadcast``, from `pytorch_scatter <https://github.com/rusty1s/pytorch_scatter/>`_
-library, that is compatible for ONNX but works only for our case.
+library, that is compatible with ONNX but works only for our case.
Args:
src (:obj:`torch.Tensor`):
37 changes: 12 additions & 25 deletions transformers_embedder/tokenizer.py
@@ -31,7 +31,7 @@ class Tokenizer:
Language model name (or a transformer :obj:`PreTrainedTokenizer`.
"""

-def __init__(self, model: Union[str, tr.PreTrainedTokenizer]):
+def __init__(self, model: Union[str, tr.PreTrainedTokenizer], language: str = "xx_sent_ud_sm"):
if isinstance(model, str):
# init HuggingFace tokenizer
self.huggingface_tokenizer = tr.AutoTokenizer.from_pretrained(model)
@@ -41,7 +41,10 @@ def __init__(self, model: Union[str, tr.PreTrainedTokenizer]):
self.huggingface_tokenizer = model
self.config = tr.AutoConfig.from_pretrained(self.huggingface_tokenizer.name_or_path)
# simple tokenizer used if the input is `str`
+# lazy load, None at first
self.spacy_tokenizer = None
+# default multilingual model
+self.language = language
# padding stuff
# default, batch length is model max length
self.subtoken_max_batch_len = self.huggingface_tokenizer.model_max_length
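
The added lines introduce a `language` argument (defaulting to spaCy's multilingual `xx_sent_ud_sm` pipeline) and leave `spacy_tokenizer` unset until it is actually needed. A rough sketch of that lazy-loading pattern, using an illustrative stand-in class rather than the repository's `Tokenizer`:

import spacy

class LazySpacyTokenizer:
    """Illustrative stand-in, not the class defined in this repository."""

    def __init__(self, language: str = "xx_sent_ud_sm"):
        self.language = language
        self.spacy_tokenizer = None  # lazy load, None at first

    def pretokenize(self, text: str):
        # The spaCy pipeline is loaded only on the first call that needs it.
        if self.spacy_tokenizer is None:
            self.spacy_tokenizer = spacy.load(self.language)
        return [token.text for token in self.spacy_tokenizer(text)]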
@@ -116,9 +119,7 @@ def __call__(
# self._type_checking(text, text_pair)

# check if input is batched or a single sample
-is_batched = bool(
-isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
-)
+is_batched = bool(isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))

# if text is str or a list of str and they are not split, then text needs to be tokenized
if isinstance(text, str) or (not is_split_into_words and isinstance(text[0], str)):
@@ -128,9 +129,7 @@
else:
text = [self.pretokenize(t, use_spacy=use_spacy) for t in text]
text_pair = (
-[self.pretokenize(t, use_spacy=use_spacy) for t in text_pair]
-if text_pair
-else None
+[self.pretokenize(t, use_spacy=use_spacy) for t in text_pair] if text_pair else None
)

# get model max length if not specified by user
@@ -297,9 +296,7 @@ def _build_tokens(
offsets.append(len(words) - 1) # -1 because we want the last index
return words, input_ids, token_type_ids, offsets

-def pad_batch(
-self, batch: Union[ModelInputs, Dict[str, list]], max_length: int = None
-) -> ModelInputs:
+def pad_batch(self, batch: Union[ModelInputs, Dict[str, list]], max_length: int = None) -> ModelInputs:
"""
Pad the batch to its maximum length or to the specified :obj:`max_length`.
@@ -366,9 +363,7 @@ def pad_sequence(
padding = [value] * abs(length - len(sequence))
if isinstance(sequence, torch.Tensor):
if len(sequence.shape) > 1:
-raise ValueError(
-f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`"
-)
+raise ValueError(f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`")
padding = torch.as_tensor(padding)
if pad_to_left:
if isinstance(sequence, torch.Tensor):
@@ -470,9 +465,7 @@ def to_tensor(self, batch: Union[ModelInputs, List[dict], dict]) -> ModelInputs:
"""
# convert to tensor
batch = {
-k: torch.as_tensor(v)
-if k in self.to_tensor_inputs and not isinstance(v, torch.Tensor)
-else v
+k: torch.as_tensor(v) if k in self.to_tensor_inputs and not isinstance(v, torch.Tensor) else v
for k, v in batch.items()
}
return ModelInputs(batch)
@@ -567,10 +560,7 @@ def is_type_correct(text_to_check: Any) -> bool:
isinstance(text_to_check[0], str)
or (
isinstance(text_to_check[0], (list, tuple))
-and (
-len(text_to_check[0]) == 0
-or isinstance(text_to_check[0][0], str)
-)
+and (len(text_to_check[0]) == 0 or isinstance(text_to_check[0][0], str))
)
)
)
@@ -739,11 +729,8 @@ def to(self, device: Union[str, torch.device]) -> ModelInputs:
"""
if isinstance(device, (str, torch.device, int)):
self.data = {
-k: v.to(device=device) if isinstance(v, torch.Tensor) else v
-for k, v in self.data.items()
+k: v.to(device=device) if isinstance(v, torch.Tensor) else v for k, v in self.data.items()
}
else:
-logger.warning(
-f"Attempting to cast to another type, {str(device)}. This is not supported."
-)
+logger.warning(f"Attempting to cast to another type, {str(device)}. This is not supported.")
return self
1 change: 0 additions & 1 deletion transformers_embedder/utils.py
@@ -1,6 +1,5 @@
import importlib.util
import logging
-from typing import Optional, Tuple, Union, Any

_torch_available = importlib.util.find_spec("torch") is not None
_spacy_available = importlib.util.find_spec("spacy") is not None
