diff --git a/src/adapters/__init__.py b/src/adapters/__init__ 2.py similarity index 98% rename from src/adapters/__init__.py rename to src/adapters/__init__ 2.py index 20d8eaf77..a10439da6 100644 --- a/src/adapters/__init__.py +++ b/src/adapters/__init__ 2.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.0.0.dev0" +__version__ = "0.2.2" from typing import TYPE_CHECKING @@ -111,7 +111,6 @@ "models.mbart": ["MBartAdapterModel"], "models.mistral": ["MistralAdapterModel"], "models.mt5": ["MT5AdapterModel"], - "models.plbart": ["PLBartAdapterModel"], "models.roberta": ["RobertaAdapterModel"], "models.t5": ["T5AdapterModel"], "models.vit": ["ViTAdapterModel"], @@ -217,10 +216,9 @@ from .models.gpt2 import GPT2AdapterModel from .models.gptj import GPTJAdapterModel from .models.llama import LlamaAdapterModel - from .models.mbart import MBartAdapterModel from .models.mistral import MistralAdapterModel + from .models.mbart import MBartAdapterModel from .models.mt5 import MT5AdapterModel - from .models.plbart import PLBartAdapterModel from .models.roberta import RobertaAdapterModel from .models.t5 import T5AdapterModel from .models.vit import ViTAdapterModel diff --git a/src/adapters/composition.py b/src/adapters/composition 2.py similarity index 96% rename from src/adapters/composition.py rename to src/adapters/composition 2.py index 62c2854ac..16761f400 100644 --- a/src/adapters/composition.py +++ b/src/adapters/composition 2.py @@ -1,5 +1,4 @@ import itertools -import warnings from collections.abc import Sequence from typing import List, Optional, Set, Tuple, Union @@ -92,7 +91,7 @@ def __init__( self, *average_adapters: List[Union[AdapterCompositionBlock, str]], weights: Optional[List[float]] = None, - normalize_weights: bool = True, + normalize_weights: bool = True ): super().__init__(*average_adapters) if weights is not None: @@ -129,7 +128,6 @@ def __init__( "bart", "mbart", "mt5", - "plbart", "gpt2", "gptj", "t5", @@ -155,7 +153,7 @@ def validate_composition(adapter_composition: AdapterCompositionBlock, level=0, f"Models of type {model_type} don't support adapter composition using {block_type.__name__}." ) for child in adapter_composition: - if not type(child) in ALLOWED_NESTINGS[type(adapter_composition)]: + if type(child) not in ALLOWED_NESTINGS[type(adapter_composition)]: raise ValueError(f"Adapter setup is invalid. Cannot nest {child} in {adapter_composition}") # recursively validate children validate_composition(child, level=level + 1) @@ -181,11 +179,6 @@ def parse_composition(adapter_composition, level=0, model_type=None) -> AdapterC else: return adapter_composition elif isinstance(adapter_composition, Sequence): - # Functionality of adapter-transformers v1.x - warnings.warn( - "Passing list objects for adapter activation is deprecated. Please use Stack or Fuse explicitly.", - category=FutureWarning, - ) # for backwards compatibility if level == 1: block_class = Fuse diff --git a/src/adapters/configuration/adapter_fusion_config.py b/src/adapters/configuration/adapter_fusion_config.py index 6dc31dab1..552bcdbe6 100644 --- a/src/adapters/configuration/adapter_fusion_config.py +++ b/src/adapters/configuration/adapter_fusion_config.py @@ -36,7 +36,7 @@ def load(cls, config: Union[dict, str], **kwargs): dict: The resolved adapter fusion configuration dictionary. """ # currently storing AdapterFusion weights on AdapterHub is not supported. 
- config_dict = resolve_adapter_config(config, local_map=ADAPTERFUSION_CONFIG_MAP) + config_dict = resolve_adapter_config(config, local_map=ADAPTERFUSION_CONFIG_MAP, try_loading_from_hub=False) # convert back to dict to allow attr overrides if isinstance(config_dict, AdapterFusionConfig): config_dict = config_dict.to_dict() diff --git a/src/adapters/configuration/model_adapters_config.py b/src/adapters/configuration/model_adapters_config.py index 3ae7dcf56..3f4c3023d 100644 --- a/src/adapters/configuration/model_adapters_config.py +++ b/src/adapters/configuration/model_adapters_config.py @@ -237,6 +237,5 @@ def build_full_config(adapter_config, model_config, save_id2label=False, **kwarg config_dict["config"] = adapter_config.to_dict() else: config_dict["config"] = adapter_config - # add lib name before version to distinguish from adapter-transformers - config_dict["version"] = "adapters." + __version__ + config_dict["version"] = __version__ return config_dict diff --git a/src/adapters/context.py b/src/adapters/context.py deleted file mode 100644 index 70e685d03..000000000 --- a/src/adapters/context.py +++ /dev/null @@ -1,151 +0,0 @@ -import functools -import threading - -from .composition import parse_composition, parse_heads_from_composition - - -class AdapterSetup: - """ - Represents an adapter setup of a model including active adapters and active heads. This class is intended to be - used as a context manager using the ``with`` statement. The setup defined by the ``AdapterSetup`` context will - override static adapter setups defined in a model (i.e. setups specified via ``active_adapters``). - - Example:: - - with AdapterSetup(Stack("a", "b")): - # will use the adapter stack "a" and "b" outputs = model(**inputs) - - Note that the context manager is thread-local, i.e. it can be used with different setups in a multi-threaded - environment. - """ - - # thread-local storage that holds a stack of active contexts - storage = threading.local() - - def __init__(self, adapter_setup, head_setup=None, ignore_empty: bool = False): - self.adapter_setup = parse_composition(adapter_setup) - if head_setup: - self.head_setup = head_setup - else: - self.head_setup = parse_heads_from_composition(self.adapter_setup) - self._empty = ignore_empty and self.adapter_setup is None and self.head_setup is None - - def __enter__(self): - if not self._empty: - AdapterSetup.get_contexts().append(self) - return self - - def __exit__(self, type, value, traceback): - if not self._empty: - AdapterSetup.get_contexts().pop() - - @classmethod - def get_contexts(cls): - if not hasattr(cls.storage, "contexts"): - cls.storage.contexts = [] - return cls.storage.contexts - - @classmethod - def get_context(cls): - try: - return cls.get_contexts()[-1] - except IndexError: - return None - - @classmethod - def get_context_adapter_setup(cls): - context = cls.get_context() - if context: - return context.adapter_setup - return None - - @classmethod - def get_context_head_setup(cls): - context = cls.get_context() - if context: - return context.head_setup - return None - - -class ForwardContext: - """ - Holds context information during a forward pass through a model. This class should be used via the - ``ForwardContext.wrap()`` method. - - Note that the context is thread-local. 
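# Illustrative sketch (assumed usage, not taken from the patch): how the AdapterSetup context
# manager deleted in the context.py hunk above is used, following its own docstring example.
# The checkpoint name and the adapter names "a"/"b" are placeholders.
from adapters import AdapterSetup, AutoAdapterModel
from adapters.composition import Stack
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoAdapterModel.from_pretrained("bert-base-uncased")
model.add_adapter("a")
model.add_adapter("b")

inputs = tokenizer("Adapters are lightweight.", return_tensors="pt")
with AdapterSetup(Stack("a", "b")):
    # the stack "a" -> "b" is active only inside this thread-local context,
    # overriding any adapters activated statically via model.active_adapters
    outputs = model(**inputs)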
- """ - - # thread-local storage that holds a stack of active contexts - storage = threading.local() - - context_attributes = [ - "adapter_gating_scores", - "adapter_fusion_attentions", - "adapter_input_parallelized", - ] - # Additional used attributes not exposed to the user - # - prompt_tokens_length: length of the prompt tokens - - def __init__(self, model, *args, **kwargs): - # If the model has a method ``forward_context()``, use it to create the context. - if hasattr(model, "forward_context"): - model.forward_context(self, *args, **kwargs) - - def __enter__(self): - ForwardContext.get_contexts().append(self) - return self - - def __exit__(self, type, value, traceback): - ForwardContext.get_contexts().pop() - - @classmethod - def wrap(cls, f): - """ - Decorator method that wraps a ``forward()`` function of a model class. - """ - - @functools.wraps(f) - def wrapper_func(self, *args, **kwargs): - if self.adapters_config is not None: - with cls(self, *args, **kwargs) as ctx: - # whether to output the context attributes - output_context = kwargs.pop("output_context", False) - kwargs = { - k: v for k, v in kwargs.items() if k.replace("output_", "") not in cls.context_attributes - } - results = f(self, *args, **kwargs) - - # append output attributes - if isinstance(results, tuple): - for attr in cls.context_attributes: - if getattr(ctx, "output_" + attr, False): - results = results + (dict(getattr(ctx, attr)),) - else: - for attr in cls.context_attributes: - if getattr(ctx, "output_" + attr, False): - results[attr] = dict(getattr(ctx, attr)) - - if output_context: - context_dict = ctx.__dict__ - - if output_context: - return results, context_dict - else: - return results - else: - return f(self, *args, **kwargs) - - return wrapper_func - - @classmethod - def get_contexts(cls): - if not hasattr(cls.storage, "contexts"): - cls.storage.contexts = [] - return cls.storage.contexts - - @classmethod - def get_context(cls): - try: - return cls.get_contexts()[-1] - except IndexError: - return None diff --git a/src/adapters/head_utils.py b/src/adapters/head_utils 2.py similarity index 95% rename from src/adapters/head_utils.py rename to src/adapters/head_utils 2.py index 8226d1ed6..60b2ce52b 100644 --- a/src/adapters/head_utils.py +++ b/src/adapters/head_utils 2.py @@ -369,27 +369,6 @@ }, "layers": ["lm_head"], }, - # PLBART - "PLBartForSequenceClassification": { - "config": { - "head_type": "classification", - "layers": 2, - "activation_function": "tanh", - }, - "layers": [ - None, - "classification_head.dense", - None, - None, - "classification_head.out_proj", - ], - }, - "PLBartForConditionalGeneration": { - "config": { - "head_type": "seq2seq_lm", - }, - "layers": ["lm_head"], - }, # MT5 "MT5ForConditionalGeneration": { "config": { @@ -673,15 +652,7 @@ }, "layers": [None, "qa_outputs"], }, - "LlamaForTokenClassification": { - "config": { - "head_type": "tagging", - "layers": 1, - "activation_function": None, - }, - "layers": [None, "score"], - }, - # Mistral + #Mistral "MistralForSequenceClassification": { "config": { "head_type": "classification", @@ -698,14 +669,6 @@ }, "layers": ["lm_head"], }, - "MistralForTokenClassification": { - "config": { - "head_type": "tagging", - "layers": 1, - "activation_function": None, - }, - "layers": [None, "score"], - }, # Electra "ElectraForTokenClassification": { "config": { diff --git a/src/adapters/heads/model_mixin.py b/src/adapters/heads 2/model_mixin 2.py similarity index 99% rename from src/adapters/heads/model_mixin.py rename to src/adapters/heads 
2/model_mixin 2.py index 9a27bbd76..bc197ddbf 100644 --- a/src/adapters/heads/model_mixin.py +++ b/src/adapters/heads 2/model_mixin 2.py @@ -134,8 +134,6 @@ def tie_weights(self): self = getattr(self, self.base_model_prefix) self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix) - super().tie_weights() - def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None): old_embeddings = self.get_input_embeddings() new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of) @@ -527,7 +525,7 @@ def forward_head( attention_mask=None, return_dict=False, context=None, - **kwargs, + **kwargs ): """ The forward pass through a prediction head configuration. There are three ways to specify the used prediction diff --git a/src/adapters/heads/base.py b/src/adapters/heads/base.py deleted file mode 100644 index 82dd8097a..000000000 --- a/src/adapters/heads/base.py +++ /dev/null @@ -1,521 +0,0 @@ -import logging -from dataclasses import dataclass -from typing import List, Optional - -import torch -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.modeling_outputs import ( - ImageClassifierOutput, - MultipleChoiceModelOutput, - QuestionAnsweringModelOutput, - Seq2SeqModelOutput, - Seq2SeqQuestionAnsweringModelOutput, - Seq2SeqSequenceClassifierOutput, - SequenceClassifierOutput, - TokenClassifierOutput, -) -from transformers.utils import ModelOutput - -from ..composition import adjust_tensors_for_parallel -from ..methods.modeling import Activation_Function_Class - - -logger = logging.getLogger(__name__) - - -@dataclass -class MultiHeadOutput(ModelOutput): - head_outputs: List[ModelOutput] = None - loss: Optional[torch.FloatTensor] = None - - @property - def logits(self): - return torch.vstack([outputs["logits"] for outputs in self.head_outputs]) - - def __getitem__(self, k): - # with number indices the head output at that position is accessed - # e.g output[1] is equivalent to output.head_outputs[1] - if isinstance(k, int): - return self.head_outputs[k] - # with strings the attribute in the underlying dict can be adressed - # e.g output["loss"] is equivalent to output.loss - else: - return super().__getitem__(k) - - def __setitem__(self, k, v): - if isinstance(k, int): - self.head_outputs[k] = v - else: - return super().__setitem__(k, v) - - def __iter__(self): - # iterates over the head outputs - return iter(self.head_outputs) - - def __len__(self): - return len(self.head_outputs) - - -# Let this class inherit from nn.Sequential to provide iterable access as before -class PredictionHead(nn.Sequential): - def __init__(self, name): - super().__init__() - self.config = {} - self.name = name - - def _get_dropout_prob(self, model_config): - # try to infer dropout prob from various sources, default to 0.0 - if "dropout_prob" in self.config and self.config["dropout_prob"] is not None: - dropout_prob = self.config["dropout_prob"] - elif hasattr(model_config, "classifier_dropout") and model_config.classifier_dropout is not None: - dropout_prob = model_config.classifier_dropout - elif hasattr(model_config, "hidden_dropout_prob") and model_config.hidden_dropout_prob is not None: - dropout_prob = model_config.hidden_dropout_prob - else: - dropout_prob = 0.0 - - return dropout_prob - - def build(self, model): - model_config = model.config - pred_head = [] - dropout_prob = self._get_dropout_prob(model_config) - bias = self.config.get("bias", True) - for l_id in 
range(self.config["layers"]): - pred_head.append(nn.Dropout(dropout_prob)) - if l_id < self.config["layers"] - 1: - pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size)) - if self.config["activation_function"]: - pred_head.append(Activation_Function_Class(self.config["activation_function"])) - else: - if "num_labels" in self.config: - pred_head.append(nn.Linear(model_config.hidden_size, self.config["num_labels"], bias=bias)) - elif "num_choices" in self.config: # used for multiple_choice head - pred_head.append(nn.Linear(model_config.hidden_size, 1, bias=bias)) - else: - pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size, bias=bias)) - if self.config["activation_function"]: - pred_head.append(Activation_Function_Class(self.config["activation_function"])) - for i, module in enumerate(pred_head): - self.add_module(str(i), module) - - # We need to import the current value of _init_weights at each execution to determine if weights init is disabled. - from transformers.modeling_utils import _init_weights - - if _init_weights: - self.apply(model._init_weights) - self.train(model.training) # make sure training mode is consistent - - def get_output_embeddings(self): - return None # override for heads with output embeddings - - def get_label_names(self): - return ["labels"] - - def _get_cls_output(self, outputs, **kwargs): - if self.config["use_pooler"]: - cls_output = kwargs.pop("pooled_output") - elif kwargs.get("get_cls_from_eos_tokens", False): - x = outputs[0] # last hidden state - eos_mask = kwargs.get("eos_mask") - (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) - if len(torch.unique(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - cls_output = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] - else: - cls_output = outputs[0][:, 0] - - return cls_output - - -class ClassificationHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=2, - activation_function="tanh", - id2label=None, - use_pooler=False, - bias=True, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "classification", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "use_pooler": use_pooler, - "bias": bias, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - if cls_output is None: - cls_output = self._get_cls_output(outputs, **kwargs) - logits = super().forward(cls_output) - loss = None - labels = kwargs.pop("labels", None) - if labels is not None: - if self.config["num_labels"] == 1: - # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config["num_labels"]), labels.view(-1)) - - if return_dict: - if isinstance(outputs, Seq2SeqModelOutput): - return Seq2SeqSequenceClassifierOutput( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - 
else: - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs - - -class MultiLabelClassificationHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=2, - activation_function="tanh", - id2label=None, - use_pooler=False, - bias=True, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "multilabel_classification", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "use_pooler": use_pooler, - "bias": bias, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - if cls_output is None: - cls_output = self._get_cls_output(outputs, **kwargs) - logits = super().forward(cls_output) - loss = None - labels = kwargs.pop("labels", None) - if labels is not None: - loss_fct = BCEWithLogitsLoss() - if labels.dtype != torch.float32: - labels = labels.float() - loss = loss_fct(logits, labels) - - if return_dict: - if isinstance(outputs, Seq2SeqModelOutput): - return Seq2SeqSequenceClassifierOutput( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - else: - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs - - -class MultipleChoiceHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_choices=2, - layers=2, - activation_function="tanh", - id2label=None, - use_pooler=False, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "multiple_choice", - "num_choices": num_choices, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "use_pooler": use_pooler, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=None, **kwargs): - if cls_output is None: - cls_output = self._get_cls_output(outputs, **kwargs) - logits = super().forward(cls_output) - logits = logits.view(-1, self.config["num_choices"]) - loss = None - labels = kwargs.pop("labels", None) - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits, labels) - - if return_dict: - return MultipleChoiceModelOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs - - -class TaggingHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=1, - activation_function="tanh", - id2label=None, - dropout_prob=None, - ): - 
super().__init__(head_name) - self.config = { - "head_type": "tagging", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - logits = super().forward(outputs[0]) - loss = None - - labels = kwargs.pop("labels", None) - if labels is not None: - loss_fct = CrossEntropyLoss() - # adjust labels for prompt tuning - if kwargs.get("prompt_tokens_length", 0) > 0: - prompt_length = kwargs.get("prompt_tokens_length") - prompt_labels = torch.full( - (labels.shape[0], prompt_length), loss_fct.ignore_index, dtype=torch.long, device=labels.device - ) - labels = torch.cat((prompt_labels, labels), dim=-1) - if attention_mask is not None: - attention_mask = torch.cat( - (torch.ones_like(prompt_labels, dtype=torch.long, device=labels.device), attention_mask), - dim=-1, - ) - - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.config["num_labels"]) - active_labels = torch.where( - active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) - ) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.config["num_labels"]), labels.view(-1)) - - if return_dict: - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs - - -class QuestionAnsweringHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=1, - activation_function="tanh", - id2label=None, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "question_answering", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - sequence_output = outputs[0] - logits = super().forward(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - start_positions = kwargs.pop("start_positions", None) - end_positions = kwargs.pop("end_positions", None) - total_loss = None - if start_positions is not None and end_positions is not None: - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if return_dict: - if isinstance(outputs, Seq2SeqModelOutput): - return Seq2SeqQuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - 
end_logits=end_logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - else: - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = ( - start_logits, - end_logits, - ) + outputs[1:] - if total_loss is not None: - outputs = (total_loss,) + outputs - return outputs - - def get_label_names(self): - return ["start_positions", "end_positions"] - - -class ImageClassificationHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=2, - activation_function="tanh", - multilabel=False, - id2label=None, - use_pooler=False, - bias=True, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "image_classification", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "multilabel": multilabel, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "use_pooler": use_pooler, - "bias": bias, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - if cls_output is None: - cls_output = self._get_cls_output(outputs, **kwargs) - logits = super().forward(cls_output) - loss = None - labels = kwargs.pop("labels", None) - if labels is not None: - if self.config["num_labels"] == 1: - # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - elif self.config["multilabel"]: - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config["num_labels"]), labels.view(-1)) - - if return_dict: - return ImageClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs diff --git a/src/adapters/heads/dependency_parsing.py b/src/adapters/heads/dependency_parsing.py index d2cbf98c0..d568f356b 100644 --- a/src/adapters/heads/dependency_parsing.py +++ b/src/adapters/heads/dependency_parsing.py @@ -2,7 +2,6 @@ Code taken and modified from: https://github.com/Adapter-Hub/hgiyt. Credits: "How Good is Your Tokenizer? 
On the Monolingual Performance of Multilingual Language Models" (Rust et al., 2021) https://arxiv.org/abs/2012.15613 """ - from dataclasses import dataclass from typing import Optional, Tuple @@ -97,7 +96,7 @@ def forward( word_starts=None, labels_arcs=None, labels_rels=None, - **kwargs, + **kwargs ): outs = self.dropout(outputs[0]) word_outputs_deps = self._merge_subword_tokens(outs, word_starts) diff --git a/src/adapters/hub_mixin.py b/src/adapters/hub_mixin 2.py similarity index 74% rename from src/adapters/hub_mixin.py rename to src/adapters/hub_mixin 2.py index c23c92eb7..7a1009c5b 100644 --- a/src/adapters/hub_mixin.py +++ b/src/adapters/hub_mixin 2.py @@ -1,5 +1,6 @@ import logging import os +import warnings from typing import List, Optional, Union from transformers.utils.generic import working_or_temp_dir @@ -35,7 +36,7 @@ from adapters import AutoAdapterModel model = AutoAdapterModel.from_pretrained("{model_name}") -adapter_name = model.load_adapter("{adapter_repo_name}", set_active=True) +adapter_name = model.load_adapter("{adapter_repo_name}", source="hf", set_active=True) ``` ## Architecture & Training @@ -61,21 +62,28 @@ def _save_adapter_card( save_directory: str, adapter_name: str, adapter_repo_name: str, + adapterhub_tag: Optional[str] = None, datasets_tag: Optional[str] = None, tags: Optional[List[str]] = None, language: Optional[str] = None, license: Optional[str] = None, metrics: Optional[List[str]] = None, - **kwargs, + **kwargs ): # Key remains "adapter-transformers", see: https://github.com/huggingface/huggingface.js/pull/459 all_tags = {"adapter-transformers"} datasets = set() # Dataset/ Task info dataset_name = None + if adapterhub_tag is None and datasets_tag is None: + raise ValueError("Either adapterhub_tag or datasets_tag must be specified.") if datasets_tag is not None: dataset_name = f"[{datasets_tag}](https://huggingface.co/datasets/{datasets_tag}/)" datasets.add(datasets_tag) + if adapterhub_tag is not None: + # adapterhub_tag overwrites datasets_tag + dataset_name = f"[{adapterhub_tag}](https://adapterhub.ml/explore/{adapterhub_tag}/)" + all_tags.add(f"adapterhub:{adapterhub_tag}") all_tags.add(self.config.model_type) if tags is not None: @@ -115,8 +123,10 @@ def _save_adapter_card( def push_adapter_to_hub( self, - repo_id: str, + repo_name: str, adapter_name: str, + organization: Optional[str] = None, + adapterhub_tag: Optional[str] = None, datasets_tag: Optional[str] = None, local_path: Optional[str] = None, commit_message: Optional[str] = None, @@ -127,15 +137,21 @@ def push_adapter_to_hub( revision: str = None, commit_description: str = None, adapter_card_kwargs: Optional[dict] = None, + **deprecated_kwargs, ): """Upload an adapter to HuggingFace's Model Hub. Args: - repo_id (str): The name of the repository on the model hub to upload to. + repo_name (str): The name of the repository on the model hub to upload to. adapter_name (str): The name of the adapter to be uploaded. organization (str, optional): Organization in which to push the adapter (you must be a member of this organization). Defaults to None. - datasets_tag (str, optional): Dataset identifier from https://huggingface.co/datasets. Defaults to + adapterhub_tag (str, optional): + Tag of the format `/` for categorization on https://adapterhub.ml/explore/. See + https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. If not specified, + `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. 
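# Illustrative sketch (assumed usage, not taken from the patch): calling push_adapter_to_hub()
# with the signature shown on the "+" side of the hub_mixin hunk above. Repo name, adapter name
# and tags are placeholders; `model` is assumed to hold a trained adapter "a" (e.g. as in the
# earlier sketch), and a valid Hugging Face token is assumed to be configured.
model.push_adapter_to_hub(
    "my-awesome-adapter",             # repo_name
    "a",                              # adapter_name of an adapter added to the model
    adapterhub_tag="sentiment/imdb",  # at least one tag is needed when a new adapter card is generated
    datasets_tag="imdb",
)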
+ datasets_tag (str, optional): Dataset identifier from https://huggingface.co/datasets. + If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to None. local_path (str, optional): Local path used as clone directory of the adapter repository. If not specified, will create a temporary directory. Defaults to None. @@ -160,6 +176,31 @@ def push_adapter_to_hub( Returns: str: The url of the adapter repository on the model hub. """ + use_auth_token = deprecated_kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in future versions of Adapters." + " Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if organization is not None and not repo_name.startswith(organization): + warnings.warn( + "The `organization` argument is deprecated and will be removed in future versions of" + " Adapters. Set your organization directly in the `repo_id` passed instead" + " (`repo_id={organization}/{model_id}`)." + ) + if "/" in repo_name: + repo_name = repo_name.split("/")[-1] + repo_id = f"{organization}/{repo_name}" + else: + repo_id = repo_name + use_temp_dir = not os.path.isdir(local_path) if local_path else True # Create repo or get retrieve an existing repo @@ -177,6 +218,7 @@ def push_adapter_to_hub( work_dir, adapter_name, repo_id, + adapterhub_tag=adapterhub_tag, datasets_tag=datasets_tag, **adapter_card_kwargs, ) diff --git a/src/adapters/loading.py b/src/adapters/loading 2.py similarity index 98% rename from src/adapters/loading.py rename to src/adapters/loading 2.py index b1918b0a0..8d730680f 100644 --- a/src/adapters/loading.py +++ b/src/adapters/loading 2.py @@ -507,7 +507,7 @@ def load( loading_info=None, leave_out=None, set_active=False, - **kwargs, + **kwargs ): """ Loads a pre-trained pytorch adapter module from the local file system or a remote location. @@ -518,9 +518,9 @@ def load( - the identifier of a pre-trained task adapter to be loaded from Adapter Hub - a path to a directory containing adapter weights saved using `model.saved_adapter()` - a URL pointing to a zip folder containing a saved adapter module - config (str, optional): Deprecated. + config (str, optional): The requested configuration of the adapter. version (str, optional): The version of the adapter to be loaded. - model_name (str, optional): Deprecated. + model_name (str, optional): The string identifier of the pre-trained model. load_as (str, optional): Load the adapter using this name. By default, the name with which the adapter was saved will be used. @@ -528,13 +528,6 @@ def load( Tuple[str, str]: A tuple consisting of the local file system directory from which the weights where loaded and the name of the loaded weights. """ - # Warn about deprecated arguments - if config is not None or model_name is not None: - logger.warning( - "The 'config' and 'model_name' arguments are specific to the now unsupported legacy Hub repo and will" - " be removed." 
- "Please switch to only providing the HF Model Hub identifier.", - ) requested_config = AdapterConfig.load(config) if config else None # Resolve the weights to be loaded based on the given identifier and the current adapter config model_name = self.model.model_name or model_name diff --git a/src/adapters/methods/adapter_layer_base.py b/src/adapters/methods/adapter_layer_base.py index b03dd5e9c..04aa24927 100644 --- a/src/adapters/methods/adapter_layer_base.py +++ b/src/adapters/methods/adapter_layer_base.py @@ -90,25 +90,21 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: """ raise NotImplementedError() - def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy, **kwargs) -> bool: + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: """Averages a set of adapter modules into a new adapter module. Args: adapter_name (str): The name of the new (averaged) adapter module to add. - input_adapters (Dict[str, float]): Dictionary of adapter names and their corresponding weights. - combine_strategy (str): The strategy to combine the adapters. Available strategies depend on the used adapter method, see: https://docs.adapterhub.ml/adapter_composition.html#merging-adapters - **kwargs: Additional arguments that are specific to the combine_strategy. E.g. svd_rank for LoRA. + input_adapters (Dict[str, float]): Either: + - a list of adapter names (with equal weighting). + - a dictionary of adapter names and their corresponding weights. Returns: bool: True if the adapter was added, False otherwise. """ # add new adapter if self.add_adapter(adapter_name, self.layer_idx): - if combine_strategy != "linear": - # You get the adapter type from the input adapters - raise ValueError(f"Combine strategy {combine_strategy} not supported for the chosen adapter methods.") - - # average weights linearly + # average weights avg_state_dict = {} for name, weight in input_adapters.items(): if name in self.adapter_modules: @@ -121,10 +117,8 @@ def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float], c else: self.delete_adapter(adapter_name) # clean up before raising error raise ValueError("Adapter {} not found.".format(name)) - # load averaged weights self.adapter_modules[adapter_name].load_state_dict(avg_state_dict) - return True return False diff --git a/src/adapters/methods/bottleneck.py b/src/adapters/methods/bottleneck.py index fa66a095e..b3125c696 100644 --- a/src/adapters/methods/bottleneck.py +++ b/src/adapters/methods/bottleneck.py @@ -1,4 +1,4 @@ -from typing import List, Mapping, NamedTuple, Optional, Union +from typing import Dict, List, Mapping, NamedTuple, Optional, Union import torch from torch import nn @@ -94,6 +94,28 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: return False + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: + # add new adapter + if self.add_adapter(adapter_name, self.layer_idx): + # average weights + avg_state_dict = {} + for name, weight in input_adapters.items(): + if name in self.adapters: + module = self.adapters[name] + for k, v in module.state_dict().items(): + if k in avg_state_dict: + avg_state_dict[k] += weight * v + else: + avg_state_dict[k] = weight * v + else: + self.delete_adapter(adapter_name) # clean up before raising error + raise ValueError("Adapter {} not found.".format(name)) + # load averaged weights + self.adapters[adapter_name].load_state_dict(avg_state_dict) + return True + + return False + 
def add_fusion_layer(self, adapter_names: Union[List, str]): """See BertModel.add_fusion_layer""" adapter_names = adapter_names if isinstance(adapter_names, list) else adapter_names.split(",") @@ -173,11 +195,9 @@ def pad_and_concat(self, states: List[BottleneckState]) -> BottleneckState: torch.cat([state.input_tensor for state in states], dim=0), torch.cat([state.adapter_residual for state in states], dim=0), states[0].layer_norm, - ( - torch.cat([state.bottleneck_up for state in states], dim=0) - if states[0].bottleneck_up is not None - else None - ), + torch.cat([state.bottleneck_up for state in states], dim=0) + if states[0].bottleneck_up is not None + else None, states[-1].last, ) diff --git a/src/adapters/methods/lora.py b/src/adapters/methods/lora.py index c62a94f26..e54042b55 100644 --- a/src/adapters/methods/lora.py +++ b/src/adapters/methods/lora.py @@ -224,109 +224,28 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: return False - def average_adapter( - self, - adapter_name: str, - input_adapters: Dict[str, float], - combine_strategy: str, - svd_rank: int = None, - **kwargs, - ) -> bool: + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: # add new adapter if self.add_adapter(adapter_name, self.layer_idx): + # average weights avg_state_dict = {} - - # First, check if all input adapters are present - for name in input_adapters.keys(): - if name not in self.loras: - self.delete_adapter(adapter_name) # clean up before raising error - raise ValueError("Adapter {} not found.".format(name)) - - # Now, combine the weights according to the strategy - if combine_strategy == "linear": - for name, weight in input_adapters.items(): + for name, weight in input_adapters.items(): + if name in self.loras: module = self.loras[name] for k, v in module.state_dict().items(): if k in avg_state_dict: avg_state_dict[k] += weight * v else: avg_state_dict[k] = weight * v - - elif combine_strategy == "lora_linear_only_negate_b": - # Same as linear but for negative weights only negate the B matrix and leave A positive - # See Zhang et al. (2023) https://proceedings.neurips.cc/paper_files/paper/2023/hash/299a08ee712d4752c890938da99a77c6-Abstract-Conference.html - for name, weight in input_adapters.items(): - module = self.loras[name] - for k, v in module.state_dict().items(): - if "lora_B" in k: - zhang_weight = weight - elif "lora_A" in k: - zhang_weight = abs(weight) - else: - # This should never happen as we only have lora_A and lora_B in the state_dict - raise ValueError( - f"Key must either contain 'lora_A' or 'lora_B' but is {k}. This should never" - " happen. Please open an issue on GitHub if you encounter this error." - ) - - if k in avg_state_dict: - avg_state_dict[k] += zhang_weight * v - else: - avg_state_dict[k] = zhang_weight * v - - elif combine_strategy == "lora_delta_w_svd": - # Weight the delta_w matrices by the input weights and then use Singular Value Decomposition (SVD) to split them into A and B matrices. 
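# Standalone sketch (assumed shapes and weights, not taken from the patch) of the
# "lora_delta_w_svd" merge that the lora.py hunk above removes: weight each adapter's
# delta_w = B @ A, sum them, then factor the result back into rank-r A/B matrices
# via a truncated SVD, mirroring the removed _average_adapter_lora_delta_w_svd helper.
import torch

d, k, rank = 16, 12, 4
loras = {  # adapter name -> (lora_B, lora_A, mixing weight); values are made up
    "a": (torch.randn(d, rank), torch.randn(rank, k), 0.7),
    "b": (torch.randn(d, rank), torch.randn(rank, k), 0.3),
}
delta_w = sum(w * (B @ A) for B, A, w in loras.values())  # d x k merged weight update
U, S, Vh = torch.linalg.svd(delta_w)                      # delta_w = U @ diag(S) @ Vh
lora_A_new = Vh[:rank, :]                                 # rank x k
lora_B_new = U[:, :rank] @ torch.diag(S[:rank])           # d x rank
# lora_B_new @ lora_A_new is the best rank-`rank` approximation of the weighted sum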
- self._average_adapter_lora_delta_w_svd(input_adapters, avg_state_dict, svd_rank) - - else: - raise ValueError(f"The combine_strategy '{combine_strategy}' is not supported for LoRA.") - + else: + self.delete_adapter(adapter_name) # clean up before raising error + raise ValueError("Adapter {} not found.".format(name)) # load averaged weights self.loras[adapter_name].load_state_dict(avg_state_dict) return True return False - def _average_adapter_lora_delta_w_svd(self, input_adapters: Dict[str, float], avg_state_dict, svd_rank): - # Weight the delta_w matrices by the input weights and then use Singular Value Decomposition to split them into A and B matrices. - if svd_rank is None: - raise ValueError("svd_rank must be set when using 'lora_delta_w_svd'.") - - # Collect delta_w matrices. Shape of every delta_w matrix in the list: d×k - delta_w = [self.loras[adapter_name].delta_w for adapter_name in input_adapters.keys()] - - # If the lora has fan_in_fan_out, we need to transpose the matrices - if self.fan_in_fan_out: - delta_w = [torch.t(delta_w) for delta_w in delta_w] - - delta_w = torch.stack(delta_w, dim=0) # shape: n×d×k - - # Weight the delta_w matrices - weights = torch.tensor(list(input_adapters.values()), device=delta_w.device) # shape: n - weights = weights.view(-1, 1, 1) # shape: n×1×1 - delta_w = delta_w * weights # shape: n×d×k - - # Now bring down to d×k matrix - delta_w = delta_w.sum(dim=0) # shape: d×k - - # Perform SVD to split delta_w into A and B matrices - U, S_diag, V = torch.linalg.svd(delta_w) - - # Reduce rank - U = U[:, :svd_rank] # U is 2D - S_diag = S_diag[:svd_rank] # S_diag is 1D - V = V[:svd_rank, :] # V is 2D - - # The SVD has decomposed delta_w into U, S, and V such that: delta_w = U @ S_diag @ V - # In LoRA we have: delta_w = B @ A - # Hence, we can set: A = V and B = U @ S_diag - if self.fan_in_fan_out: - avg_state_dict["lora_A"] = torch.t(V) - avg_state_dict["lora_B"] = torch.t(U @ torch.diag(S_diag)) - else: - avg_state_dict["lora_A"] = V - avg_state_dict["lora_B"] = U @ torch.diag(S_diag) - class LoRAState(NamedTuple): """Models the input and output states of a LoRA layer. 
@@ -370,7 +289,7 @@ def __init__( attn_key: str = None, fan_in_fan_out: bool = False, no_init_bias: bool = False, - **kwargs, + **kwargs ): if no_init_bias and "bias" not in kwargs: kwargs["bias"] = False @@ -391,7 +310,7 @@ def wrap( model_config: PretrainedConfig, adapters_config: ModelAdaptersConfig, attn_key: str = None, - **kwargs, + **kwargs ): if isinstance(module, Conv1D): new_module = LoRALinearTorch( @@ -489,11 +408,9 @@ def repeat(self, state: LoRAState, channels: int) -> LoRAState: def mean(self, states: List[LoRAState], weights: torch.Tensor) -> LoRAState: return LoRAState( states[0].layer_input, - ( - torch.mean(torch.stack([s.hidden_states for s in states], dim=0) * weights, dim=0) - if states[0].hidden_states is not None - else None - ), + torch.mean(torch.stack([s.hidden_states for s in states], dim=0) * weights, dim=0) + if states[0].hidden_states is not None + else None, states[0].layer_output, states[-1].last, ) @@ -635,7 +552,7 @@ def __init__( adapters_config: ModelAdaptersConfig, fan_in_fan_out: bool = False, no_init_bias: bool = False, - **kwargs, + **kwargs ): if no_init_bias and "bias" not in kwargs: kwargs["bias"] = False @@ -654,7 +571,7 @@ def wrap( location_key: str, model_config: PretrainedConfig, adapters_config: ModelAdaptersConfig, - **kwargs, + **kwargs ): if isinstance(module, Conv1D): new_module = cls( diff --git a/src/adapters/methods/prefix_tuning.py b/src/adapters/methods/prefix_tuning.py index 1f7d4094b..5e98ca266 100644 --- a/src/adapters/methods/prefix_tuning.py +++ b/src/adapters/methods/prefix_tuning.py @@ -186,14 +186,8 @@ def confirm_prefix(self, prefix_name: str) -> bool: del self.prefix_counts[prefix_name] return True - def average_prefix( - self, prefix_name: str, input_adapters: Dict[str, float], combine_strategy: str, **kwargs - ) -> bool: + def average_prefix(self, prefix_name: str, input_adapters: Dict[str, float]) -> bool: if self.confirm_prefix(prefix_name): - # Prefix Tuning only support linear combination - if combine_strategy != "linear": - raise ValueError(f"Combine strategy {combine_strategy} not supported for prefix tuning.") - # average weights avg_state_dict = {} for name, weight in input_adapters.items(): @@ -343,15 +337,9 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: return False - def average_adapter( - self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy: str, **kwargs - ) -> bool: + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: # add new adapter if self.add_adapter(adapter_name, self.layer_idx): - # Prefix Tuning only support linear combination - if combine_strategy != "linear": - raise ValueError(f"Combine strategy {combine_strategy} not supported for prefix tuning.") - # prefix averaging is handled in pool, only average gates here if adapter_name in self.prefix_gates: avg_state_dict = {} @@ -442,8 +430,10 @@ def pad_and_concat(self, states: List[PrefixTuningState]) -> PrefixTuningState: value_states = F.pad(value_states, pad_size, "constant", self.model_config.pad_token_id) # pad attention mask - if pad_length > 0 and attention_mask is not None: + if pad_length > 0: # Masking the padded tokens only works correctly if attention_mask is set + # We assume this to be the case at this point + assert attention_mask is not None, "Attention mask must be set for prefix tuning" attention_mask = F.pad( attention_mask, (max_prefix_length - attention_mask.shape[-1], 0), diff --git a/src/adapters/methods/prompt_tuning.py 
b/src/adapters/methods/prompt_tuning.py index aaf729d9f..8ac2b5fff 100644 --- a/src/adapters/methods/prompt_tuning.py +++ b/src/adapters/methods/prompt_tuning.py @@ -1,7 +1,7 @@ # https://github.com/google-research/prompt-tuning/blob/main/prompt_tuning/train/prompts.py import math -from typing import Callable +from typing import Callable, Dict import numpy as np import torch @@ -161,6 +161,28 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: return False + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: + # add new adapter + if self.add_adapter(adapter_name, -1): + # average weights + avg_state_dict = {} + for name, weight in input_adapters.items(): + if name in self.prompt_tunings: + module = self.prompt_tunings[name] + for k, v in module.state_dict().items(): + if k in avg_state_dict: + avg_state_dict[k] += weight * v + else: + avg_state_dict[k] = weight * v + else: + self.delete_adapter(adapter_name) # clean up before raising error + raise ValueError("Adapter {} not found.".format(name)) + # load averaged weights + self.prompt_tunings[adapter_name].load_state_dict(avg_state_dict) + return True + + return False + def forward(self, hidden_states: torch.Tensor): prefix_attention_mask_length = None adapter_setup = self.get_active_setup() diff --git a/src/adapters/model_mixin.py b/src/adapters/model_mixin 2.py similarity index 86% rename from src/adapters/model_mixin.py rename to src/adapters/model_mixin 2.py index 180259581..c6c2ab597 100644 --- a/src/adapters/model_mixin.py +++ b/src/adapters/model_mixin 2.py @@ -1,19 +1,16 @@ import inspect import logging import os +import warnings from abc import ABC, abstractmethod from collections import defaultdict -from copy import deepcopy from os.path import join from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch from torch import nn -from adapters.configuration.adapter_config import ConfigUnion, LoRAConfig -from transformers import GenerationConfig from transformers.modeling_outputs import ModelOutput -from transformers.utils import is_accelerate_available from .composition import AdapterCompositionBlock, Fuse, Stack, parse_composition from .configuration import ADAPTER_CONFIG_MAP, AdapterConfig, AdapterFusionConfig, BnConfig @@ -33,9 +30,6 @@ logger = logging.getLogger(__name__) -if is_accelerate_available(): - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - class InvertibleAdaptersMixin: """Mixin for Transformer models adding invertible adapters.""" @@ -85,17 +79,9 @@ def add_invertible_adapter(self, adapter_name: str) -> bool: return False - def _average_invertible_adapter( - self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy: str - ) -> bool: + def _average_invertible_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: # add new adapter if self.add_invertible_adapter(adapter_name): - if combine_strategy != "linear": - raise ValueError( - f"Combine strategy {combine_strategy} not supported for invertible adapters. Only 'linear' is" - " supported." 
- ) - # average weights avg_state_dict = {} for name, weight in input_adapters.items(): @@ -185,13 +171,9 @@ def add_invertible_adapter(self, adapter_name: str) -> bool: return self.invertible_adapters_base.add_invertible_adapter(adapter_name) return False - def _average_invertible_adapter( - self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy: str - ) -> bool: + def _average_invertible_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: if self.invertible_adapters_base is not None: - return self.invertible_adapters_base._average_invertible_adapter( - adapter_name, input_adapters, combine_strategy - ) + return self.invertible_adapters_base._average_invertible_adapter(adapter_name, input_adapters) return False def delete_invertible_adapter(self, adapter_name: str): @@ -384,9 +366,6 @@ class ModelAdaptersMixin(PushAdapterToHubMixin, ABC): """Mixin for transformer models adding support for loading/ saving adapters.""" add_base_adapters = False - support_lora_delta_w_svd = ( - True # If True, the model supports the "lora_delta_w_svd" combine_strategy to merge adapter weights. - ) support_prompt_tuning = True # If False, the prompt tuning layer is not added to the model. If True, the prompt tuning layer is added if add_base_adapters is True. def __init__(self, config, *args, **kwargs): @@ -504,6 +483,14 @@ def train_adapter(self, adapter_setup: Union[list, AdapterCompositionBlock], tra self.get_input_embeddings().train() self.get_input_embeddings().weight.requires_grad = True + def train_fusion(self, adapter_setup: Union[list, AdapterCompositionBlock], unfreeze_adapters=False): + """Sets the model into mode for training of adapter fusion determined by a list of adapter names.""" + warnings.warn( + "add_fusion() has been deprecated in favor of add_adapter_fusion(). Please use the newer method instead.", + FutureWarning, + ) + self.train_adapter_fusion(adapter_setup, unfreeze_adapters=unfreeze_adapters) + def train_adapter_fusion(self, adapter_setup: Union[list, AdapterCompositionBlock], unfreeze_adapters=False): """Sets the model into mode for training of adapter fusion determined by a list of adapter names.""" self.train() @@ -624,6 +611,14 @@ def _add_adapter_weights(self, adapter_name: str): if isinstance(self, InvertibleAdaptersMixin) or isinstance(self, InvertibleAdaptersWrapperMixin): self.add_invertible_adapter(adapter_name) + def add_fusion(self, adapter_names: Union[Fuse, list], adapter_fusion_config=None, override_kwargs=None): + warnings.warn( + "add_fusion() has been deprecated in favor of add_adapter_fusion(). Please use the newer method instead.", + FutureWarning, + ) + adapter_fusion_config = AdapterFusionConfig.from_dict(adapter_fusion_config).replace(**override_kwargs) + self.add_adapter_fusion(adapter_names, adapter_fusion_config) + def add_adapter_fusion( self, adapter_names: Union[Fuse, list, str], @@ -788,12 +783,13 @@ def load_adapter( version: str = None, model_name: str = None, load_as: str = None, + source: str = None, custom_weights_loaders: Optional[List[WeightsLoader]] = None, leave_out: Optional[List[int]] = None, id2label=None, set_active: bool = False, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: """ Loads a pre-trained pytorch adapter module from the local file system or a remote location. 
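# Illustrative sketch (assumed usage, not taken from the patch): load_adapter() with the
# `source` argument restored in this file. Checkpoint and adapter repo id are examples only.
from adapters import AutoAdapterModel
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoAdapterModel.from_pretrained("roberta-base")
adapter_name = model.load_adapter(
    "AdapterHub/roberta-base-pf-imdb",  # example Hugging Face Model Hub repo id
    source="hf",     # "hf" = HF Model Hub, "ah" = legacy AdapterHub repo (deprecated), None = try all
    set_active=True,
)
outputs = model(**tokenizer("A great movie!", return_tensors="pt"))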
@@ -804,11 +800,20 @@ def load_adapter( - the identifier of a pre-trained task adapter to be loaded from Adapter Hub - a path to a directory containing adapter weights saved using `model.saved_adapter()` - a URL pointing to a zip folder containing a saved adapter module - config (dict or str, optional): Deprecated. + config (dict or str, optional): The requested configuration of the adapter. + If not specified, will be either: - the default adapter config for the requested adapter if specified - + the global default adapter config version (str, optional): The version of the adapter to be loaded. - model_name (str, optional): Deprecated. + model_name (str, optional): The string identifier of the pre-trained model. load_as (str, optional): Load the adapter using this name. By default, the name with which the adapter was saved will be used. + source (str, optional): Identifier of the source(s) from where to load the adapter. Can be: + + - "ah": search on AdapterHub Hub repo. + Note: the Hub repo has been archived and all adapters have been moved to HuggingFace Model Hub. + Loading from this source is deprecated. + - "hf": search on HuggingFace Model Hub. + - None (default): search on all sources leave_out: Dynamically drop adapter modules in the specified Transformer layers when loading the adapter. set_active (bool, optional): Set the loaded adapter to be the active one. By default (False), the adapter is loaded but not @@ -825,6 +830,7 @@ def load_adapter( version, model_name, load_as, + source=source, leave_out=leave_out, set_active=set_active, **kwargs, @@ -849,7 +855,7 @@ def load_adapter_fusion( custom_weights_loaders: Optional[List[WeightsLoader]] = None, set_active: bool = False, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: """ Loads a pre-trained AdapterFusion layer from the local file system. @@ -1162,12 +1168,7 @@ def adapter_summary(self, as_dict=False) -> Union[str, dict]: s.append("=" * total_length) return "\n".join(s) - def _average_shared_parameters(self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy: str): - if combine_strategy != "linear": - raise ValueError( - f"Combine strategy {combine_strategy} not supported for shared parameters. Only 'linear' is supported." - ) - + def _average_shared_parameters(self, adapter_name: str, input_adapters: Dict[str, float]): avg_state_dict = {} for name, weight in input_adapters.items(): if name in self.base_model.shared_parameters: @@ -1181,109 +1182,38 @@ def _average_shared_parameters(self, adapter_name: str, input_adapters: Dict[str raise ValueError(f"Adapter {name} not found in shared parameters.") self.base_model.shared_parameters[adapter_name] = nn.ParameterDict(avg_state_dict) - def _pre_average_adapter_checks( - self, - adapter_name: str, - adapter_list: List[str], - combine_strategy: str, - valid_combination_strategies: List[str], - is_head=False, - ): - # Check if combine_strategy is valid - if combine_strategy not in valid_combination_strategies: - raise ValueError( - f"Invalid combine_strategy '{combine_strategy}'. Must be one of {valid_combination_strategies}" - ) - - # Some strategies are not supported by all models - if combine_strategy == "lora_delta_w_svd" and not self.base_model.support_lora_delta_w_svd: - raise ValueError( - "This model specifically does not support 'lora_delta_w_svd' as a merging method. Please use a" - " different combine_strategy or a different model." 
- ) - - head_or_adapter = "head" if is_head else "adapter" - - # Provide the user with some information about the adapters to be averaged - logging.info(f"Creating new {head_or_adapter} called {adapter_name} by averaging {adapter_list}.") - if not is_head: - logging.info("In case you want to create a new head as well please use the `average_head` function.") - - if len(adapter_list) == 0: - raise ValueError("No adapters to average. Please provide at least one adapter to average.") - if len(adapter_list) == 1: - logging.info( - "You provided only one adapter to average. If you set `normalize_weights` to true, this will result in" - " duplicating the adapter. If not this will result in scaling the adapter weights. We will use the" - " linear combination strategy for this." - ) - - # For ConfigUnion, only support linear combination - if isinstance(self.adapters_config.get(adapter_list[0]), ConfigUnion): - if combine_strategy != "linear": - raise ValueError( - "Combining adapters with ConfigUnion is only supported with the 'linear' combine_strategy." - ) - def average_adapter( self, adapter_name: str, - adapter_list: Union[List[str], Dict[str, float]], + adapter_list: List[str], weights: Optional[List[float]] = None, - combine_strategy: str = "linear", normalize_weights: bool = True, overwrite_ok: bool = False, set_active: bool = False, - svd_rank: int = None, # if other combination strategies are implemented that need new parameters, this should be moved to **kwargs ): """ Adds a new adapter module as weighted average of a set of existing adapter modules. Args: adapter_name (str): The name of the adapter module to be added. - adapter_list (List[str] or Dict[str, float]): + input_adapters (List[str] or Dict[str, float]): Specifies the existing adapters whose weights should be averaged. Can either be a list of adapter names or a dictionary mapping adapter names to weights. - weights (Optional[List[float]], optional): The weights corresponding to each adapter module in the list. - If not provided, equal weights will be assigned to each adapter. - combine_strategy (str, optional): The strategy to combine the adapter modules. - Available options are "linear", "lora_linear_only_negate_b", and "lora_delta_w_svd". - See https://docs.adapterhub.ml/adapter_composition.html#merging-adapters - Defaults to "linear". - normalize_weights (bool, optional): Whether to normalize the weights. - If True, the weights will be normalized to sum up to 1. - Defaults to True. overwrite_ok (bool, optional): Overwrite an adapter with the same name if it exists. By default (False), an exception is thrown. set_active (bool, optional): Set the adapter to be the active one. By default (False), the adapter is added but not activated. - svd_rank (int, optional): The rank to be used for Singular Value Decomposition (SVD) when averaging LoRA adapters. - This parameter is only applicable when the combine_strategy is set to "lora_delta_w_svd". - Defaults to None. 
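# Illustrative sketch (assumed usage, not taken from the patch): average_adapter() as defined on
# the "-" side of this hunk, i.e. with combine_strategy and svd_rank. Adapter names are
# placeholders and are assumed to be LoRA adapters sharing the same configuration.
model.average_adapter(
    "merged",
    ["a", "b"],
    weights=[0.7, 0.3],
    combine_strategy="lora_delta_w_svd",  # "linear" and "lora_linear_only_negate_b" also exist
    svd_rank=8,                           # only used by the SVD strategy
    set_active=True,
)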
""" - - valid_combination_strategies = ["linear", "lora_linear_only_negate_b", "lora_delta_w_svd"] - self._pre_average_adapter_checks(adapter_name, adapter_list, combine_strategy, valid_combination_strategies) - + # To be able to average the weights, all adapter configs must be the same config = None for name in adapter_list: if config is None: config = self.adapters_config.get(name) - elif get_adapter_config_hash(config, ignore_params=["dropout", "init_weights"]) != get_adapter_config_hash( - self.adapters_config.get(name), ignore_params=["dropout", "init_weights"] - ): + elif get_adapter_config_hash(config) != get_adapter_config_hash(self.adapters_config.get(name)): raise ValueError( "Cannot average adapters with different configurations. " "Please make sure all adapters have the same configuration." ) - - # In case svd_rank is set, change the config to use the new rank - if svd_rank is not None: - if isinstance(config, LoRAConfig): - config = config.replace(r=svd_rank) - else: - logging.warning("SVD rank can only be set when averaging LoRA adapters. Ignoring svd_rank.") - # In case adapter already exists and we allow overwriting, explicitly delete the existing one first if overwrite_ok and adapter_name in self.adapters_config: self.delete_adapter(adapter_name) @@ -1299,25 +1229,17 @@ def average_adapter( sum_weights = 1.0 input_adapters = {name: weight / sum_weights for name, weight in zip(adapter_list, weights)} try: - self.apply_to_adapter_layers( - lambda i, layer: layer.average_adapter( - adapter_name, input_adapters, combine_strategy, svd_rank=svd_rank - ) - ) - self.apply_to_basemodel_childs( - lambda i, child: child.average_adapter( - adapter_name, input_adapters, combine_strategy, svd_rank=svd_rank - ) - ) + self.apply_to_adapter_layers(lambda i, layer: layer.average_adapter(adapter_name, input_adapters)) + self.apply_to_basemodel_childs(lambda i, child: child.average_adapter(adapter_name, input_adapters)) # PHM Layer if self.adapters_config.match(adapter_name, BnConfig, location_key="phm_layer"): - self._average_shared_parameters(adapter_name, input_adapters, combine_strategy) + self._average_shared_parameters(adapter_name, input_adapters) # Prefix Tuning for module in self.modules(): if isinstance(module, PrefixTuningPool): - module.average_prefix(adapter_name, input_adapters, combine_strategy) + module.average_prefix(adapter_name, input_adapters) if isinstance(self, InvertibleAdaptersMixin) or isinstance(self, InvertibleAdaptersWrapperMixin): - self._average_invertible_adapter(adapter_name, input_adapters, combine_strategy) + self._average_invertible_adapter(adapter_name, input_adapters) except ValueError as ex: self.delete_adapter(adapter_name) raise ex @@ -1358,21 +1280,10 @@ def reset_adapter(self): # HACK Copied from transformers/generation/utils.py def _prepare_encoder_decoder_kwargs_for_generation( - self, - inputs_tensor: torch.Tensor, - model_kwargs, - model_input_name: Optional[str], - generation_config: GenerationConfig, + self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None ) -> Dict[str, Any]: # 1. get encoder encoder = self.get_encoder() - # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device - # as the inputs. - if hasattr(self, "hf_device_map"): - if hasattr(encoder, "_hf_hook"): - encoder._hf_hook.io_same_device = True - else: - add_hook_to_module(encoder, AlignDevicesHook(io_same_device=True)) # 2. 
prepare encoder args and encoder kwargs from model kwargs irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] @@ -1381,6 +1292,7 @@ def _prepare_encoder_decoder_kwargs_for_generation( for argument, value in model_kwargs.items() if not any(argument.startswith(p) for p in irrelevant_prefix) } + encoder_signature = set(inspect.signature(encoder.forward).parameters) encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature if not encoder_accepts_wildcard: @@ -1389,8 +1301,6 @@ def _prepare_encoder_decoder_kwargs_for_generation( for argument, value in encoder_kwargs.items() if argument in encoder_signature or argument == "adapter_input_parallelized" } - encoder_kwargs["output_attentions"] = generation_config.output_attentions - encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states # 3. make sure that encoder returns `ModelOutput` model_input_name = model_input_name if model_input_name is not None else self.main_input_name @@ -1545,12 +1455,12 @@ def train_adapter(self, adapter_setup: Union[list, AdapterCompositionBlock], tra super().train_adapter(adapter_setup, train_embeddings) else: self.base_model.train_adapter(adapter_setup, train_embeddings) - - # If the head has tied weights with the embedding layer (e.g. masked language modeling head), the last layer is - # only trained when train_embeddings is set to True if not train_embeddings: self.freeze_embeddings() + # Hack to prevent HF Trainer from throwing an error due to peft missing. + self._hf_peft_config_loaded = True + def train_adapter_fusion(self, adapter_setup: Union[list, AdapterCompositionBlock], unfreeze_adapters=False): """ Sets the model into mode for training of adapter fusion determined by a list of adapter names. If @@ -1562,93 +1472,6 @@ def train_adapter_fusion(self, adapter_setup: Union[list, AdapterCompositionBloc self.base_model.train_adapter_fusion(adapter_setup, unfreeze_adapters=unfreeze_adapters) self.freeze_embeddings() - def average_head( - self, - head_name: str, - head_list: Union[List[str], Dict[str, float]], - weights: Optional[List[float]] = None, - normalize_weights: bool = True, - overwrite_ok: bool = False, - set_active: bool = False, - ): - """ - Adds a new prediction head as a weighted average of a set of existing prediction heads. - - Args: - head_name (str): The name of the new prediction head to be added. - head_list (List[str] or Dict[str, float]): - Specifies the existing heads whose weights should be averaged. Can either be a list of head names - or a dictionary mapping head names to weights. - weights (Optional[List[float]], optional): The weights corresponding to each head in the list. - If not provided, equal weights will be assigned to each head. - normalize_weights (bool, optional): Whether to normalize the weights. - If True, the weights will be normalized to sum up to 1. - Defaults to True. - overwrite_ok (bool, optional): - Overwrite a head with the same name if it exists. By default (False), an exception is thrown. - set_active (bool, optional): - Set the head to be the active one. By default (False), the head is added but not activated. 
- """ - - self._pre_average_adapter_checks( - head_name, head_list, "linear", ["linear"], is_head=True - ) # Currently, only linear averaging is supported for heads - - # Ensure all heads to be averaged are of the same class - head_class = type(self.heads[head_list[0]]) - for name in head_list: - if not isinstance(self.heads[name], head_class): - raise ValueError( - f"Cannot average heads of different classes. All heads must be of type {head_class.__name__}." - ) - - # Ensure that all heads have the same configuration - head_config = self.heads[head_list[0]].config - - for name in head_list: - if get_adapter_config_hash(head_config, ignore_params=["dropout_prob"]) != get_adapter_config_hash( - self.heads[name].config, ignore_params=["dropout_prob"] - ): - raise ValueError( - "Cannot average heads with different configurations. " - "Please make sure all heads have the same configuration." - ) - - # In case the head already exists and we allow overwriting, explicitly delete the existing one first - if overwrite_ok and head_name in self.heads: - self.delete_head(head_name) - - # Now that we have ensured that all heads are of the same class and have the same configuration, - # we can add the new head by copy one of the existing heads and then replacing the weights - new_head = deepcopy(self.heads[head_list[0]]) # This is a PredictionHead - new_head.name = head_name - - if weights is None: - eq_weight = 1.0 / len(head_list) - input_heads = {name: eq_weight for name in head_list} - else: - # Normalize weights if specified - if normalize_weights: - sum_weights = sum(weights) - else: - sum_weights = 1.0 - input_heads = {name: weight / sum_weights for name, weight in zip(head_list, weights)} - - # Average the state dictionaries of the heads - avg_state_dict = {} - for name, weight in input_heads.items(): - for k, v in self.heads[name].state_dict().items(): - if k in avg_state_dict: - avg_state_dict[k] += weight * v - else: - avg_state_dict[k] = weight * v - - # Load the averaged state dictionary into the new head - new_head.load_state_dict(avg_state_dict) - - # Add the new head to the model - self.add_prediction_head(new_head, set_active=set_active) - def save_head(self, save_directory: str, head_name: str = None, use_safetensors: bool = False) -> None: """Saves a model prediction head to a directory such that it can be reloaded using `load_head()`. @@ -1666,7 +1489,7 @@ def load_head( load_as: str = None, id2label: Dict[int, str] = None, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: """Loads a model prediction head from a directory where it was saved using `save_head()`. 
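Illustrative usage sketch (not part of the patch): the `load_adapter(..., source=...)` and `average_adapter(...)` signatures restored in the hunks above are typically driven as shown below. This is a minimal sketch assuming the public adapters API; the checkpoint, Hub identifier, and adapter names are placeholders.

import adapters
from adapters import SeqBnConfig
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
adapters.init(model)

# Load a pre-trained adapter, searching the HuggingFace Model Hub ("hf" source).
# The Hub identifier below is a placeholder.
model.load_adapter("AdapterHub/bert-base-uncased-pf-sst2", source="hf", load_as="sst")

# Average two adapters that share the same configuration into a new adapter;
# with the signature above, only linear weighted averaging is available.
model.add_adapter("task-a", config=SeqBnConfig())
model.add_adapter("task-b", config=SeqBnConfig())
model.average_adapter("task-avg", ["task-a", "task-b"], weights=[0.7, 0.3], normalize_weights=True, set_active=True)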
@@ -1715,13 +1538,14 @@ def load_adapter( version: str = None, model_name: str = None, load_as: str = None, + source: str = None, with_head: bool = True, custom_weights_loaders: Optional[List[WeightsLoader]] = None, leave_out: Optional[List[int]] = None, id2label=None, set_active: bool = False, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: if with_head: if custom_weights_loaders is None: @@ -1744,6 +1568,7 @@ def load_adapter( version=version, model_name=model_name, load_as=load_as, + source=source, custom_weights_loaders=custom_weights_loaders, leave_out=leave_out, id2label=id2label, @@ -1829,7 +1654,7 @@ def load_adapter_fusion( set_active: bool = False, with_head: bool = True, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: if with_head: if custom_weights_loaders is None: diff --git a/src/adapters/models/__init__.py b/src/adapters/models/__init__.py index 8e759698d..6d54544c2 100644 --- a/src/adapters/models/__init__.py +++ b/src/adapters/models/__init__.py @@ -14,18 +14,11 @@ CLIPTextTransformerAdaptersMixin, CLIPVisionModelAdaptersMixin, ) -from .deberta.mixin_deberta import DebertaModelAdaptersMixin from .distilbert.mixin_distilbert import DistilBertModelAdaptersMixin, DistilBertTransformerAdaptersMixin from .gpt2.mixin_gpt2 import GPT2ModelAdapterMixin from .gptj.mixin_gptj import GPTJMLPAdaptersMixin, GPTJModelAdapterMixin from .llama.mixin_llama import LlamaForQuestionAnsweringAdapterMixin, LlamaModelAdapterMixin from .mistral.mixin_mistral import MistralModelAdapterMixin -from .plbart.mixin_plbart import ( - PLBartDecoderAdaptersMixin, - PLBartDecoderWrapperAdaptersMixin, - PLBartEncoderAdaptersMixin, - PLBartModelAdaptersMixin, -) from .t5.mixin_t5 import ( T5BlockAdaptersMixin, T5ForCondiditionalGenerationWithHeadsMixin, @@ -41,8 +34,8 @@ "AlbertModel": AlbertModelAdaptersMixin, "BartEncoder": BartEncoderAdaptersMixin, "BartDecoder": BartDecoderAdaptersMixin, - "BartDecoderWrapper": BartDecoderWrapperAdaptersMixin, "BartModel": BartModelAdaptersMixin, + "BartDecoderWrapper": BartDecoderWrapperAdaptersMixin, "BeitIntermediate": BeitIntermediateAdaptersMixin, "BeitOutput": BeitOutputAdaptersMixin, "BeitModel": BeitModelAdaptersMixin, @@ -68,10 +61,6 @@ "MT5ForConditionalGeneration": T5ForCondiditionalGenerationWithHeadsMixin, "MT5ForQuestionAnswering": T5ForQuestionAnsweringWithHeadsMixin, "MT5EncoderModel": T5ModelAdaptersMixin, - "PLBartEncoder": PLBartEncoderAdaptersMixin, - "PLBartDecoder": PLBartDecoderAdaptersMixin, - "PLBartModel": PLBartModelAdaptersMixin, - "PLBartDecoderWrapper": PLBartDecoderWrapperAdaptersMixin, "GPT2Model": GPT2ModelAdapterMixin, "GPTJMLP": GPTJMLPAdaptersMixin, "GPTJModel": GPTJModelAdapterMixin, @@ -88,9 +77,9 @@ "XLMRobertaModel": BertModelAdaptersMixin, "XmodLayer": BertLayerAdaptersMixin, "XmodModel": XmodModelAdaptersMixin, - "DebertaModel": DebertaModelAdaptersMixin, + "DebertaModel": BertModelAdaptersMixin, "DebertaLayer": BertLayerAdaptersMixin, - "DebertaV2Model": DebertaModelAdaptersMixin, + "DebertaV2Model": BertModelAdaptersMixin, "DebertaV2Layer": BertLayerAdaptersMixin, "BertGenerationEncoder": BertModelAdaptersMixin, "BertGenerationLayer": BertLayerAdaptersMixin, diff --git a/src/adapters/models/albert/adapter_model.py b/src/adapters/models/albert/adapter_model.py index 73892bb2f..8f6c07d47 100644 --- a/src/adapters/models/albert/adapter_model.py +++ b/src/adapters/models/albert/adapter_model.py @@ -52,7 +52,7 @@ def forward( head=None, output_adapter_gating_scores=False, 
output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None diff --git a/src/adapters/models/auto/adapter_model.py b/src/adapters/models/auto/adapter_model.py index 7ab5cd80f..6a962ff63 100644 --- a/src/adapters/models/auto/adapter_model.py +++ b/src/adapters/models/auto/adapter_model.py @@ -22,10 +22,9 @@ ("gpt2", "GPT2AdapterModel"), ("gptj", "GPTJAdapterModel"), ("llama", "LlamaAdapterModel"), - ("mbart", "MBartAdapterModel"), ("mistral", "MistralAdapterModel"), + ("mbart", "MBartAdapterModel"), ("mt5", "MT5AdapterModel"), - ("plbart", "PLBartAdapterModel"), ("roberta", "RobertaAdapterModel"), ("t5", "T5AdapterModel"), ("vit", "ViTAdapterModel"), diff --git a/src/adapters/models/bart/adapter_model.py b/src/adapters/models/bart/adapter_model.py index 4e07fc5f1..384955cc1 100644 --- a/src/adapters/models/bart/adapter_model.py +++ b/src/adapters/models/bart/adapter_model.py @@ -67,7 +67,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): @@ -127,7 +127,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs, + **kwargs ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/adapters/models/bart/modeling_bart.py b/src/adapters/models/bart/modeling_bart.py index 080455b49..b347fddf0 100644 --- a/src/adapters/models/bart/modeling_bart.py +++ b/src/adapters/models/bart/modeling_bart.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch BART model.""" +""" PyTorch BART model.""" from typing import Optional, Tuple import torch diff --git a/src/adapters/models/beit/modeling_beit.py b/src/adapters/models/beit/modeling_beit.py index 865fcdeae..1ed5082be 100644 --- a/src/adapters/models/beit/modeling_beit.py +++ b/src/adapters/models/beit/modeling_beit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch BEiT model.""" +""" PyTorch BEiT model.""" import math @@ -33,9 +33,7 @@ def forward( hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["BeitRelativePositionBias"] = None, - interpolate_pos_encoding: bool = False, - resolution: Optional[Tuple[int]] = None, + relative_position_bias: Optional[BeitRelativePositionBias] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: mixed_query_layer = self.query(hidden_states) @@ -52,11 +50,7 @@ def forward( # Add relative position bias if present. if self.relative_position_bias is not None: - height, width = resolution - window_size = (height // self.config.patch_size, width // self.config.patch_size) - attention_scores = attention_scores + self.relative_position_bias( - window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1] - ) + attention_scores = attention_scores + self.relative_position_bias().unsqueeze(0) # Add shared relative position bias if provided. 
if relative_position_bias is not None: @@ -92,17 +86,13 @@ def forward( hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["BeitRelativePositionBias"] = None, - interpolate_pos_encoding: bool = False, - resolution: Optional[Tuple[int]] = None, + relative_position_bias: Optional[BeitRelativePositionBias] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: self_attention_outputs = self.attention( self.layernorm_before(hidden_states), # in BEiT, layernorm is applied before self-attention head_mask, output_attentions=output_attentions, relative_position_bias=relative_position_bias, - interpolate_pos_encoding=interpolate_pos_encoding, - resolution=resolution, ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights diff --git a/src/adapters/models/bert/adapter_model.py b/src/adapters/models/bert/adapter_model.py index a15f3e432..0b8e18943 100644 --- a/src/adapters/models/bert/adapter_model.py +++ b/src/adapters/models/bert/adapter_model.py @@ -54,7 +54,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None diff --git a/src/adapters/models/bert/modeling_bert.py b/src/adapters/models/bert/modeling_bert.py index de860151e..ea60b6f5d 100644 --- a/src/adapters/models/bert/modeling_bert.py +++ b/src/adapters/models/bert/modeling_bert.py @@ -23,17 +23,13 @@ import torch.utils.checkpoint from torch import nn -from transformers.models.bert.modeling_bert import BertOutput, BertSdpaSelfAttention, BertSelfAttention, BertSelfOutput -from transformers.utils import logging +from transformers.models.bert.modeling_bert import BertOutput, BertSelfAttention, BertSelfOutput from ...composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel from ...utils import prefix_attention_mask from .mixin_bert import BertOutputAdaptersMixin, BertSelfAttentionAdaptersMixin, BertSelfOutputAdaptersMixin -logger = logging.get_logger(__name__) - - class BertSelfAttentionWithAdapters(BertSelfAttentionAdaptersMixin, BertSelfAttention): def forward( self, @@ -146,107 +142,6 @@ def forward( return outputs -class BertSdpaSelfAttentionWithAdapters(BertSelfAttentionAdaptersMixin, BertSdpaSelfAttention): - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - attention_mask = prefix_attention_mask(attention_mask, [2, 3]) # type: ignore - - if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: - # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. - logger.warning_once( - "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support" - " non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. 
Falling back to" - " the manual attention implementation, but specifying the manual implementation will be required from" - " Transformers version v5.0.0 onwards. This warning can be removed using the argument" - ' `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - bsz, tgt_len, _ = hidden_states.size() - - # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention - # mask needs to be such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - current_states = encoder_hidden_states if is_cross_attention else hidden_states - attention_mask = encoder_attention_mask if is_cross_attention else attention_mask - - # Check `seq_length` of `past_key_value` == `len(current_states)` to support prefix tuning - if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]: - key_layer, value_layer = past_key_value - else: - key_layer = self.transpose_for_scores(self.key(current_states)) - value_layer = self.transpose_for_scores(self.value(current_states)) - if past_key_value is not None and not is_cross_attention: - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - - query_layer = self.transpose_for_scores(self.query(hidden_states)) - query_layer, key_layer, value_layer = match_attn_matrices_for_parallel(query_layer, key_layer, value_layer) - (attention_mask,) = adjust_tensors_for_parallel(query_layer, attention_mask) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - key_layer, value_layer, attention_mask = self.prefix_tuning( - key_layer, value_layer, hidden_states, attention_mask - ) - (query_layer,) = adjust_tensors_for_parallel(key_layer, query_layer) - bsz = query_layer.size(0) - - # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom - # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0. - # Reference: https://github.com/pytorch/pytorch/issues/112577 - if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None: - query_layer = query_layer.contiguous() - key_layer = key_layer.contiguous() - value_layer = value_layer.contiguous() - - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal - # mask in case tgt_len == 1. 
- is_causal = self.is_decoder and attention_mask is None and tgt_len > 1 - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - attn_mask=attention_mask, - dropout_p=self.dropout_prob if self.training else 0.0, - is_causal=is_causal, - ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) - - outputs = (attn_output,) - if self.is_decoder: - outputs = outputs + (past_key_value,) - return outputs - - class BertSelfOutputWithAdapters(BertSelfOutputAdaptersMixin, BertSelfOutput): def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(hidden_states) diff --git a/src/adapters/models/bert_generation/adapter_model.py b/src/adapters/models/bert_generation/adapter_model.py index d3822e24a..072c1b099 100644 --- a/src/adapters/models/bert_generation/adapter_model.py +++ b/src/adapters/models/bert_generation/adapter_model.py @@ -54,7 +54,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None diff --git a/src/adapters/models/clip/adapter_model.py b/src/adapters/models/clip/adapter_model.py index 7734cd021..39382757e 100644 --- a/src/adapters/models/clip/adapter_model.py +++ b/src/adapters/models/clip/adapter_model.py @@ -42,7 +42,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): outputs, context = self.clip( input_ids=input_ids, diff --git a/src/adapters/models/clip/modeling_clip.py b/src/adapters/models/clip/modeling_clip.py index 7328e532c..b74a0308e 100644 --- a/src/adapters/models/clip/modeling_clip.py +++ b/src/adapters/models/clip/modeling_clip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch CLIP model.""" +""" PyTorch CLIP model.""" from typing import Optional, Tuple @@ -21,25 +21,11 @@ import torch.utils.checkpoint from torch import nn -from transformers.models.clip.modeling_clip import ( - CLIPAttention, - CLIPEncoderLayer, - CLIPFlashAttention2, - CLIPSdpaAttention, -) -from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_2 -from transformers.utils import is_flash_attn_2_available, logging - - -if is_flash_attn_2_available(): - from transformers.modeling_flash_attention_utils import _flash_attention_forward +from transformers.models.clip.modeling_clip import CLIPAttention, CLIPEncoderLayer from .mixin_clip import CLIPAttentionAdaptersMixin, CLIPEncoderLayerAdaptersMixin -logger = logging.get_logger(__name__) - - class CLIPAttentionWithAdapters(CLIPAttentionAdaptersMixin, CLIPAttention): def forward( self, @@ -60,11 +46,9 @@ def forward( proj_shape = (bsz * self.num_heads, -1, self.head_dim) query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) - # >>> END AH Changes <<< key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) @@ -131,155 +115,6 @@ def forward( return attn_output, attn_weights_reshaped -class CLIPFlashAttention2WithAdapters(CLIPAttentionAdaptersMixin, CLIPFlashAttention2): - # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - output_attentions = False - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # >>> START AH Changes <<< - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - # >>> END AH Changes <<< - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim) - value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim) - - dropout_rate = self.dropout if self.training else 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. 
- - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = _flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - is_causal=causal_attention_mask is not None, - use_top_left_mask=self._flash_attn_uses_top_left_mask, - ) - - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous() - attn_output = self.out_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights - - -class CLIPSdpaAttentionWithAdapters(CLIPAttentionAdaptersMixin, CLIPSdpaAttention): - # Adapted from CLIPAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "CLIPModel is using CLIPSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not " - "support `output_attentions=True`. Falling back to the manual attention implementation, but specifying " - "the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can " - 'be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - ) - - # CLIP text model uses both `causal_attention_mask` and `attention_mask` - if attention_mask is not None and causal_attention_mask is not None: - attn_mask = attention_mask + causal_attention_mask - elif causal_attention_mask is not None: - attn_mask = causal_attention_mask - else: - attn_mask = attention_mask - - bsz, tgt_len, embed_dim = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) - - # >>> START AH Changes <<< - key_states, value_states, attn_mask = self.prefix_tuning(key_states, value_states, hidden_states, attn_mask) - # >>> END AH Changes <<< - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. 
- if not is_torch_greater_or_equal_than_2_2 and query_states.device.type == "cuda" and attn_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - # CLIP text model uses both `causal_attention_mask` and `attention_mask` sequentially. - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attn_mask, - dropout_p=self.dropout if self.training else 0.0, - scale=self.scale, - ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, None - - class CLIPEncoderLayerWithAdapters(CLIPEncoderLayerAdaptersMixin, CLIPEncoderLayer): def forward( self, diff --git a/src/adapters/models/deberta/adapter_model.py b/src/adapters/models/deberta/adapter_model.py index f5e15e8cb..32ec9cd45 100644 --- a/src/adapters/models/deberta/adapter_model.py +++ b/src/adapters/models/deberta/adapter_model.py @@ -45,7 +45,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None diff --git a/src/adapters/models/deberta/mixin_deberta.py b/src/adapters/models/deberta/mixin_deberta.py index 817b9a83e..272302931 100644 --- a/src/adapters/models/deberta/mixin_deberta.py +++ b/src/adapters/models/deberta/mixin_deberta.py @@ -1,7 +1,6 @@ from ...methods.lora import LoRAMergedLinear from ...methods.prefix_tuning import PrefixTuningLayer from ...utils import patch_forward -from ..bert.mixin_bert import BertModelAdaptersMixin class DebertaSelfAttentionAdaptersMixin: @@ -15,8 +14,3 @@ def init_adapters(self, model_config, adapters_config): self.location_key + "_prefix" if self.location_key else None, model_config, adapters_config ) patch_forward(self) - - -class DebertaModelAdaptersMixin(BertModelAdaptersMixin): - # Same as BERT, except that Deberta does not support the "lora_delta_w_svd" combine_strategy - support_lora_delta_w_svd = False diff --git a/src/adapters/models/deberta/modeling_deberta.py b/src/adapters/models/deberta/modeling_deberta.py index 4380b5e03..1feca72b4 100644 --- a/src/adapters/models/deberta/modeling_deberta.py +++ b/src/adapters/models/deberta/modeling_deberta.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch DeBERTa model.""" +""" PyTorch DeBERTa model.""" import torch import torch.utils.checkpoint diff --git a/src/adapters/models/deberta_v2/adapter_model.py b/src/adapters/models/deberta_v2/adapter_model.py index 07092debd..c306f8f47 100644 --- a/src/adapters/models/deberta_v2/adapter_model.py +++ b/src/adapters/models/deberta_v2/adapter_model.py @@ -47,7 +47,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None diff --git a/src/adapters/models/deberta_v2/modeling_deberta_v2.py b/src/adapters/models/deberta_v2/modeling_deberta_v2.py index bc41ae82a..56d6fec44 100644 --- a/src/adapters/models/deberta_v2/modeling_deberta_v2.py +++ b/src/adapters/models/deberta_v2/modeling_deberta_v2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch DeBERTa-v2 model.""" +""" PyTorch DeBERTa-v2 model.""" import torch import torch.utils.checkpoint diff --git a/src/adapters/models/distilbert/adapter_model.py b/src/adapters/models/distilbert/adapter_model.py index 3f38c893c..c28f12440 100644 --- a/src/adapters/models/distilbert/adapter_model.py +++ b/src/adapters/models/distilbert/adapter_model.py @@ -74,7 +74,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/adapters/models/distilbert/modeling_distilbert.py b/src/adapters/models/distilbert/modeling_distilbert.py index e59aa1ad5..cbd501942 100644 --- a/src/adapters/models/distilbert/modeling_distilbert.py +++ b/src/adapters/models/distilbert/modeling_distilbert.py @@ -14,8 +14,8 @@ # limitations under the License. """ -PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in -part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) + PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in + part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ diff --git a/src/adapters/models/electra/adapter_model.py b/src/adapters/models/electra/adapter_model.py index 57e20fadb..dbccce40d 100644 --- a/src/adapters/models/electra/adapter_model.py +++ b/src/adapters/models/electra/adapter_model.py @@ -54,7 +54,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None diff --git a/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py b/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py index 1572087d9..43178898f 100644 --- a/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -"""Classes to support Encoder-Decoder architectures""" +""" Classes to support Encoder-Decoder architectures""" from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel diff --git a/src/adapters/models/gpt2/adapter_model.py b/src/adapters/models/gpt2/adapter_model.py index 2cfbdc882..041ab2a18 100644 --- a/src/adapters/models/gpt2/adapter_model.py +++ b/src/adapters/models/gpt2/adapter_model.py @@ -65,7 +65,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/adapters/models/gpt2/mixin_gpt2.py b/src/adapters/models/gpt2/mixin_gpt2.py index 3362fe4dc..d52952130 100644 --- a/src/adapters/models/gpt2/mixin_gpt2.py +++ b/src/adapters/models/gpt2/mixin_gpt2.py @@ -60,7 +60,6 @@ def init_adapters(self, model_config, adapters_config): class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin): support_prompt_tuning = False - support_lora_delta_w_svd = False def init_adapters(self, model_config, adapters_config): super().init_adapters(model_config, adapters_config) diff --git a/src/adapters/models/gpt2/modeling_gpt2.py b/src/adapters/models/gpt2/modeling_gpt2.py index bb6410f83..1c571c23f 100644 --- a/src/adapters/models/gpt2/modeling_gpt2.py +++ b/src/adapters/models/gpt2/modeling_gpt2.py @@ -20,16 +20,12 @@ import torch import torch.utils.checkpoint -from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2SdpaAttention -from transformers.utils import logging +from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_ from .mixin_gpt2 import GPT2AttentionAdaptersMixin, GPT2DecoderBlockAdaptersMixin -logger = logging.get_logger(__name__) - - class GPT2AttentionWithAdapters(GPT2AttentionAdaptersMixin, GPT2Attention): def forward( self, @@ -69,10 +65,8 @@ def forward( else: present = None - # >>> START AH Changes <<< key, value, attention_mask = self.prefix_tuning(key, value, hidden_states, attention_mask) (query,) = adjust_tensors_for_parallel(key, query) - # >>> END AH Changes <<< if self.reorder_and_upcast_attn: attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) @@ -90,104 +84,6 @@ def forward( return outputs # a, present, (attentions) -class GPT2SdpaAttentionWithAdapters(GPT2AttentionAdaptersMixin, GPT2SdpaAttention): - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: - if output_attentions or head_mask is not None: - logger.warning_once( - "`GPT2SdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " - "`output_attentions=True` or `head_mask`. 
Falling back to the manual attention implementation, but " - "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - bsz, q_len, _ = hidden_states.size() - - # Initial attention projections - is_cross_attention = encoder_hidden_states is not None - if is_cross_attention: - if not hasattr(self, "q_attn"): - raise ValueError( - "If class is used as cross attention, the weights `q_attn` have to be defined. " - "Please make sure to instantiate class with `GPT2SdpaAttention(..., is_cross_attention=True)`." - ) - - query = self.q_attn(hidden_states) - key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) - attention_mask = encoder_attention_mask - else: - query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) - - # Optional kv caching - if layer_past is not None: - past_key = layer_past[0] - past_value = layer_past[1] - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - - present = None - if use_cache is True: - present = (key, value) - - # >>> START AH Changes <<< - key, value, attention_mask = self.prefix_tuning(key, value, hidden_states, attention_mask) - (query,) = adjust_tensors_for_parallel(key, query) - bsz = key.shape[0] - # >>> END AH Changes <<< - - # Avoid torch==2.1.2 specific bug for the memory-efficient backend in SDPA - if self.require_contiguous_qkv and query.device.type == "cuda" and attention_mask is not None: - query = query.contiguous() - key = key.contiguous() - value = value.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
- is_causal = True if attention_mask is None and q_len > 1 and not is_cross_attention else False - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query, - key, - value, - attn_mask=attention_mask, - dropout_p=self.attn_dropout.p if self.training else 0.0, - is_causal=is_causal, - ) - - # Reshape outputs - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.embed_dim) - - # Final projection - attn_output = self.c_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - return attn_output, present, None - - class GPT2BlockWithAdapters(GPT2DecoderBlockAdaptersMixin, GPT2Block): def forward( self, diff --git a/src/adapters/models/gptj/adapter_model.py b/src/adapters/models/gptj/adapter_model.py index f029f840d..4553ebf2b 100644 --- a/src/adapters/models/gptj/adapter_model.py +++ b/src/adapters/models/gptj/adapter_model.py @@ -63,7 +63,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/adapters/models/gptj/modeling_gptj.py b/src/adapters/models/gptj/modeling_gptj.py index 3880df12c..700e919a1 100644 --- a/src/adapters/models/gptj/modeling_gptj.py +++ b/src/adapters/models/gptj/modeling_gptj.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch GPT-J model.""" +""" PyTorch GPT-J model.""" from typing import Optional, Tuple, Union diff --git a/src/adapters/models/llama/adapter_model.py b/src/adapters/models/llama/adapter_model.py index c3116fbe1..1076677ac 100644 --- a/src/adapters/models/llama/adapter_model.py +++ b/src/adapters/models/llama/adapter_model.py @@ -17,7 +17,7 @@ @add_start_docstrings( """ -The Llama Model that allows the loading of different heads for different tasks. This enables a flexible use of the +The Llama Model that allows the loading of different heads dor different tasks. This enables a flexible use of the models and adpters. Since this class does classification on the last token, it requires to know the position of the last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since @@ -64,7 +64,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( diff --git a/src/adapters/models/llama/modeling_llama.py b/src/adapters/models/llama/modeling_llama.py index 461cdde2b..f62091c47 100644 --- a/src/adapters/models/llama/modeling_llama.py +++ b/src/adapters/models/llama/modeling_llama.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch LLaMA model.""" +""" PyTorch LLaMA model.""" import math import warnings from typing import Optional, Tuple @@ -28,16 +28,8 @@ from torch import nn from adapters.composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel -from transformers.cache_utils import Cache, StaticCache -from transformers.modeling_flash_attention_utils import _flash_attention_forward -from transformers.models.llama.modeling_llama import ( - LlamaAttention, - LlamaDecoderLayer, - LlamaFlashAttention2, - LlamaSdpaAttention, - apply_rotary_pos_emb, - repeat_kv, -) +from transformers.cache_utils import Cache +from transformers.models.llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, apply_rotary_pos_emb, repeat_kv from transformers.utils import logging from .mixin_llama import LlamaAttentionMixin, LlamaDecoderLayerMixin @@ -58,7 +50,6 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -89,23 +80,13 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." 
- ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings + past_key_value = getattr(self, "past_key_value", past_key_value) + cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -116,16 +97,15 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - bsz = key_states.shape[0] - # >>> END AH Changes <<< attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + bsz = key_states.shape[0] + if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask @@ -143,7 +123,7 @@ def forward( attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) if self.config.pretraining_tp > 1: attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) @@ -158,7 +138,7 @@ def forward( return attn_output, attn_weights, past_key_value -class LlamaFlashAttention2WithAdapters(LlamaAttentionMixin, LlamaFlashAttention2): +class LlamaFlashAttention2WithAdapters(LlamaAttentionMixin, LlamaAttention): def forward( self, hidden_states: torch.Tensor, @@ -168,15 +148,8 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if isinstance(past_key_value, StaticCache): - raise ValueError( - "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make" - " sure to use `sdpa` in the mean time, and open an issue at" - " https://github.com/huggingface/transformers" - ) - output_attentions = False bsz, q_len, _ = hidden_states.size() @@ -192,38 +165,27 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." 
- ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings + cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # Make adjustments since (parallel) prefix tuning changes the attention mask + bsz = key_states.shape[0] - # >>> END AH Changes <<< + + past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache # to be able to avoid many of these transpose/reshape/view. @@ -259,19 +221,11 @@ def forward( key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) - attn_output = _flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - sliding_window=getattr(self, "sliding_window", None), - use_top_left_mask=self._flash_attn_uses_top_left_mask, - is_causal=self.is_causal, + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate ) - attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() attn_output = self.o_proj(attn_output) if not output_attentions: @@ -280,7 +234,7 @@ def forward( return attn_output, attn_weights, past_key_value -class LlamaSdpaAttentionWithAdapters(LlamaAttentionMixin, LlamaSdpaAttention): +class LlamaSdpaAttentionWithAdapters(LlamaAttentionMixin, LlamaAttention): # Adapted from LlamaAttention.forward def forward( @@ -292,8 +246,6 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 - **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
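Illustrative usage sketch (not part of the patch): the `self.prefix_tuning(...)` and `adjust_tensors_for_parallel(...)` calls kept in the Llama attention classes above only take effect once a matching adapter is added through the public API. A minimal sketch assuming the standard adapters workflow; the checkpoint path is a placeholder.

import adapters
from adapters import PrefixTuningConfig
from transformers import AutoModelForCausalLM

# Any Llama-style causal LM checkpoint works the same way; the path is a placeholder.
model = AutoModelForCausalLM.from_pretrained("path/to/llama-checkpoint")
adapters.init(model)

# Adding a prefix-tuning adapter is what makes the prefix_tuning calls in the
# attention forward passes above return non-empty prefix key/value states.
model.add_adapter("llama-prefix", config=PrefixTuningConfig(prefix_length=30))
model.train_adapter("llama-prefix")  # freezes the base model, trains only the prefix parameters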
@@ -311,7 +263,6 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, ) bsz, q_len, _ = hidden_states.size() @@ -324,25 +275,17 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." - ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings + cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + # In case static cache is used, it is an instance attribute. + past_key_value = getattr(self, "past_key_value", past_key_value) + if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} @@ -351,16 +294,15 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # >>> END AH Changes <<< bsz = key_states.shape[0] causal_mask = attention_mask + # if attention_mask is not None and cache_position is not None: if attention_mask is not None: causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] @@ -371,17 +313,12 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
- is_causal = True if causal_mask is None and q_len > 1 else False - attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/adapters/models/mbart/adapter_model.py b/src/adapters/models/mbart/adapter_model.py index ebbfb45ef..186aef5c0 100644 --- a/src/adapters/models/mbart/adapter_model.py +++ b/src/adapters/models/mbart/adapter_model.py @@ -68,7 +68,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). @@ -136,7 +136,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs, + **kwargs ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/adapters/models/mbart/modeling_mbart.py b/src/adapters/models/mbart/modeling_mbart.py index 45bdceae2..0f8f0d533 100644 --- a/src/adapters/models/mbart/modeling_mbart.py +++ b/src/adapters/models/mbart/modeling_mbart.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch MBART model.""" +""" PyTorch MBART model.""" from typing import Optional, Tuple import torch diff --git a/src/adapters/models/mistral/adapter_model.py b/src/adapters/models/mistral/adapter_model.py index 1909fccde..3897c377d 100644 --- a/src/adapters/models/mistral/adapter_model.py +++ b/src/adapters/models/mistral/adapter_model.py @@ -1,5 +1,5 @@ import logging - +from typing import Optional import torch from transformers.models.mistral.modeling_mistral import MISTRAL_START_DOCSTRING, MistralModel, MistralPreTrainedModel @@ -16,7 +16,7 @@ @add_start_docstrings( """ -The Mistal Model that allows the loading of different heads for different tasks. This enables a flexible use of the +The Mistral Model that allows the loading of different heads for different tasks. This enables a flexible use of the models and adapters. Since this class does classification on the last token, it requires to know the position of the last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. 
Since @@ -56,13 +56,14 @@ def forward( past_key_values=None, inputs_embeds=None, use_cache=None, + cache_position: Optional[torch.LongTensor] = None, output_attentions=None, output_hidden_states=None, return_dict=None, head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -77,6 +78,7 @@ def forward( position_ids=position_ids, inputs_embeds=inputs_embeds, use_cache=use_cache, + cache_position=cache_position, output_attentions=output_attentions, return_dict=return_dict, output_hidden_states=output_hidden_states, diff --git a/src/adapters/models/mistral/mixin_mistral.py b/src/adapters/models/mistral/mixin_mistral.py index 9acd17995..09c810e7f 100644 --- a/src/adapters/models/mistral/mixin_mistral.py +++ b/src/adapters/models/mistral/mixin_mistral.py @@ -6,6 +6,7 @@ from ...methods.lora import LoRALinear from ...methods.prefix_tuning import PrefixTuningLayer from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin +from ...utils import patch_forward class MistralAttentionMixin: @@ -16,6 +17,8 @@ def init_adapters(self, model_config, adapters_config): self.prefix_tuning = PrefixTuningLayer("self_prefix", model_config, adapters_config) + patch_forward(self) + class MistralDecoderLayerMixin: def init_adapters(self, model_config, adapters_config): @@ -26,6 +29,8 @@ def init_adapters(self, model_config, adapters_config): self.attention_adapters = BottleneckLayer("mh_adapter") self.output_adapters = BottleneckLayer("output_adapter") + patch_forward(self) + class MistralModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin): support_prompt_tuning = False diff --git a/src/adapters/models/mistral/modeling_mistral.py b/src/adapters/models/mistral/modeling_mistral.py index 00e020515..900d831e2 100644 --- a/src/adapters/models/mistral/modeling_mistral.py +++ b/src/adapters/models/mistral/modeling_mistral.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch Mistral model.""" +""" PyTorch Mistral model.""" +import inspect import math from typing import Optional, Tuple @@ -25,17 +26,11 @@ import torch.utils.checkpoint from torch import nn -from adapters.composition import ( - adjust_tensors_for_parallel, - adjust_tensors_for_parallel_, - match_attn_matrices_for_parallel, -) +from adapters.composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel from transformers.cache_utils import Cache, StaticCache from transformers.models.mistral.modeling_mistral import ( MistralAttention, MistralDecoderLayer, - MistralFlashAttention2, - MistralSdpaAttention, apply_rotary_pos_emb, repeat_kv, ) @@ -43,15 +38,18 @@ from .mixin_mistral import MistralAttentionMixin, MistralDecoderLayerMixin - if is_flash_attn_2_available(): - from transformers.models.mistral.modeling_mistral import _flash_supports_window_size + from flash_attn import flash_attn_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) logger = logging.get_logger(__name__) class MistralAttentionWithAdapters(MistralAttentionMixin, MistralAttention): + """Multi-headed attention from 'Attention Is All You Need' paper""" + def forward( self, hidden_states: torch.Tensor, @@ -72,13 +70,12 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< + past_key_value = getattr(self, "past_key_value", past_key_value) cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) @@ -90,17 +87,15 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # Make adjustments since (parallel) prefix tuning changes the attention mask - bsz = key_states.shape[0] - # >>> END AH Changes <<< attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + bsz = key_states.shape[0] + if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask @@ -127,7 +122,13 @@ def forward( return attn_output, attn_weights, past_key_value -class MistralFlashAttention2WithAdapters(MistralAttentionMixin, MistralFlashAttention2): +class MistralFlashAttention2WithAdapters(MistralAttentionMixin, MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + def forward( self, hidden_states: torch.Tensor, @@ -140,11 +141,9 @@ def forward( ): if isinstance(past_key_value, StaticCache): raise ValueError( - "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make" - " sure to use `sdpa` in the mean time, and open an issue at" - " https://github.com/huggingface/transformers" + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" ) - output_attentions = False bsz, q_len, _ = hidden_states.size() @@ -157,12 +156,10 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< kv_seq_len = key_states.shape[-2] if past_key_value is not None: @@ -171,6 +168,13 @@ def forward( cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + key_states, value_states, attention_mask = self.prefix_tuning( + key_states, value_states, hidden_states, attention_mask + ) + (query_states,) = adjust_tensors_for_parallel(key_states, query_states) + + bsz = key_states.shape[0] + use_sliding_windows = ( _flash_supports_window_size and getattr(self.config, "sliding_window", None) is not None @@ -179,10 +183,12 @@ def forward( if not _flash_supports_window_size: logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory" - " efficient implementation make sure to upgrade flash-attn library." + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." ) + past_key_value = getattr(self, "past_key_value", past_key_value) + if past_key_value is not None: # Activate slicing cache only if the config has a value `sliding_windows` attribute cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 @@ -201,8 +207,8 @@ def forward( if past_key.shape[-2] != self.config.sliding_window - 1: raise ValueError( - "past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1," - f" head_dim`), got {past_key.shape}" + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" ) if attention_mask is not None: @@ -217,16 +223,6 @@ def forward( value_states = repeat_kv(value_states, self.num_key_value_groups) dropout_rate = 0.0 if not self.training else self.attention_dropout - # >>> START AH Changes <<< - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # Make adjustments since (parallel) prefix tuning changes the attention mask - kv_seq_len = key_states.shape[-2] - bsz = key_states.shape[0] - # >>> END AH Changes <<< - # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. 
Hence, we need # cast them back in float16 just to be sure everything works as expected. @@ -241,8 +237,8 @@ def forward( target_dtype = self.q_proj.weight.dtype logger.warning_once( - "The input hidden states seems to be silently casted in float32, this might be related to the fact" - " you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" f" {target_dtype}." ) @@ -274,7 +270,15 @@ def forward( return attn_output, attn_weights, past_key_value -class MistralSdpaAttentionWithAdapters(MistralAttentionMixin, MistralSdpaAttention): +# Adapted from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +class MistralSdpaAttentionWithAdapters(MistralAttentionMixin, MistralAttention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward def forward( self, hidden_states: torch.Tensor, @@ -289,10 +293,8 @@ def forward( if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention`" - " does not support `output_attentions=True`. Falling back to the manual attention implementation, but" - " specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This" - ' warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( hidden_states=hidden_states, @@ -314,12 +316,10 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) @@ -332,14 +332,10 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # Make adjustments since (parallel) prefix tuning changes the attention mask - bsz = key_states.shape[0] - # >>> END AH Changes <<< causal_mask = attention_mask if attention_mask is not None: @@ -379,7 +375,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, @@ -403,9 +399,7 @@ def forward( kwargs (`dict`, *optional*): Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code into the model - """ - - adjust_tensors_for_parallel_(hidden_states, attention_mask, position_ids) + """ # adjust_tensors_for_parallel(hidden_states, attention_mask, position_ids) residual = hidden_states hidden_states = self.input_layernorm(hidden_states) diff --git a/src/adapters/models/mt5/adapter_model.py b/src/adapters/models/mt5/adapter_model.py index 418b47b13..2868aec3e 100644 --- a/src/adapters/models/mt5/adapter_model.py +++ b/src/adapters/models/mt5/adapter_model.py @@ -81,7 +81,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -161,7 +161,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs, + **kwargs ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/adapters/models/mt5/modeling_mt5.py b/src/adapters/models/mt5/modeling_mt5.py index b982d34d6..12ad630a7 100644 --- a/src/adapters/models/mt5/modeling_mt5.py +++ b/src/adapters/models/mt5/modeling_mt5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch MT5 model.""" +""" PyTorch MT5 model.""" import torch from torch import nn diff --git a/src/adapters/models/plbart/__init__.py b/src/adapters/models/plbart/__init__.py deleted file mode 100644 index 1160ba151..000000000 --- a/src/adapters/models/plbart/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' 
imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - -# Copyright 2020 The Adapter-Hub Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import TYPE_CHECKING - -from transformers.utils import _LazyModule - - -_import_structure = { - "adapter_model": ["PLBartAdapterModel"], -} - - -if TYPE_CHECKING: - from .adapter_model import PLBartAdapterModel - -else: - import sys - - sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - ) diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py deleted file mode 100644 index 2aaaf0b9f..000000000 --- a/src/adapters/models/plbart/adapter_model.py +++ /dev/null @@ -1,162 +0,0 @@ -import torch - -from transformers.models.plbart.modeling_plbart import ( - PLBART_INPUTS_DOCSTRING, - PLBART_START_DOCSTRING, - PLBartConfig, - PLBartModel, - PLBartPreTrainedModel, - shift_tokens_right, -) -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward - -from ...heads import ModelWithFlexibleHeadsAdaptersMixin -from ...model_mixin import EmbeddingAdaptersWrapperMixin -from ...wrappers import init - - -@add_start_docstrings( - "PLBART Model with the option to add multiple flexible prediction heads on top.", PLBART_START_DOCSTRING -) -class PLBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, PLBartPreTrainedModel): - _tied_weights_keys = [ - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] - - head_types = [ - "classification", - "multilabel_classification", - "question_answering", - "seq2seq_lm", - ] - - def __init__(self, config: PLBartConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = PLBartModel(config) - init(self.model) - - self._init_head_modules() - - self.post_init() - - def get_encoder(self): - return self.model.get_encoder() - - def get_decoder(self): - return self.model.get_decoder() - - @add_start_docstrings_to_model_forward(PLBART_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - past_key_values=None, - head=None, - output_adapter_gating_scores=False, - output_adapter_fusion_attentions=False, - **kwargs, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: - use_cache = False - - outputs, context = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - past_key_values=past_key_values, - output_adapter_gating_scores=output_adapter_gating_scores, - output_adapter_fusion_attentions=output_adapter_fusion_attentions, - adapter_input_parallelized=kwargs.pop("adapter_input_parallelized", False), - output_context=True, - ) - # required e.g. for prompt tuning in all models - kwargs["context"] = context - - head_outputs = self.forward_head( - outputs, - head_name=head, - attention_mask=attention_mask, - return_dict=return_dict, - get_cls_from_eos_tokens=True, - # `get_cls_from_eos_tokens` requires passing eos mask - eos_mask=input_ids.eq(self.config.eos_token_id) if input_ids is not None else None, - **kwargs, - ) - - return head_outputs - - # Copied from PLBartForConditionalGeneration - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - "adapter_input_parallelized": kwargs.pop("adapter_input_parallelized", False), - } - - # Copied from PLBartForConditionalGeneration - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return shift_tokens_right(labels, self.config.pad_token_id) # , self.config.decoder_start_token_id) - - # Copied from PLBartForConditionalGeneration - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - # cached cross_attention states don't have to be reordered -> they are always the same - reordered_past += ( - tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], - ) - return reordered_past diff --git a/src/adapters/models/plbart/mixin_plbart.py b/src/adapters/models/plbart/mixin_plbart.py deleted file mode 100644 index bd02e04de..000000000 --- a/src/adapters/models/plbart/mixin_plbart.py +++ /dev/null @@ -1,109 +0,0 @@ -from typing import Iterable, Optional, Tuple - -import torch -import torch.nn as nn - -from ...composition import adjust_tensors_for_parallel -from ...methods.bottleneck import BottleneckLayer -from ...methods.lora import LoRALinear -from ...methods.prefix_tuning import PrefixTuningLayer -from ...model_mixin import ( - EmbeddingAdaptersMixin, - EmbeddingAdaptersWrapperMixin, - InvertibleAdaptersMixin, - InvertibleAdaptersWrapperMixin, - ModelBaseAdaptersMixin, -) - - -class PLBartAttentionAdaptersMixin: - """Adds adapters to the BartAttention module.""" - - def init_adapters(self, model_config, adapters_config): - # Wrap layers for LoRA - self.k_proj = LoRALinear.wrap(self.k_proj, "selfattn", model_config, adapters_config, attn_key="k") - self.v_proj = LoRALinear.wrap(self.v_proj, "selfattn", model_config, adapters_config, attn_key="v") - self.q_proj = LoRALinear.wrap(self.q_proj, "selfattn", model_config, adapters_config, attn_key="q") - - self.prefix_tuning = PrefixTuningLayer( - self.location_key + "_prefix" if self.location_key else None, model_config, adapters_config - ) - - -class PLBartEncoderLayerAdaptersMixin: - """Adds adapters to the PLBartEncoderLayer module of PLBART.""" - - def init_adapters(self, model_config, adapters_config): - self.adapters_config = adapters_config - # Wrap layers for LoRA - self.fc1 = LoRALinear.wrap(self.fc1, "intermediate", model_config, adapters_config) - self.fc2 = LoRALinear.wrap(self.fc2, "output", model_config, adapters_config) - - # Set attention layer location key for prefix tuning - self.self_attn.location_key = "encoder" - self.attention_adapters = BottleneckLayer("mh_adapter") - self.output_adapters = BottleneckLayer("output_adapter") - - -class PLBartDecoderLayerAdaptersMixin(PLBartEncoderLayerAdaptersMixin): - """Adds adapters to the PLBartDecoderLayer module of PLBART.""" - - def init_adapters(self, model_config, adapters_config): - super().init_adapters(model_config, adapters_config) - # Set attention layer location key for prefix tuning - self.self_attn.location_key = "self" - self.encoder_attn.location_key = "cross" - self.cross_attention_adapters = BottleneckLayer("cross_adapter") - - -class PLBartEncoderAdaptersMixin(InvertibleAdaptersMixin): - """Adds adapters to 
the PLBartEncoder module of PLBART.""" - - pass - - -class PLBartDecoderAdaptersMixin: - """Adds adapters to the PLBartDecoder module of PLBART.""" - - def forward( - self, input_ids: torch.LongTensor = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, **kwargs - ): - (input_ids,) = adjust_tensors_for_parallel(encoder_hidden_states, input_ids) - return super().forward(input_ids=input_ids, encoder_hidden_states=encoder_hidden_states, **kwargs) - - -class PLBartModelAdaptersMixin(EmbeddingAdaptersMixin, InvertibleAdaptersWrapperMixin, ModelBaseAdaptersMixin): - """Adds adapters to the PLBartModel class.""" - - invertible_adapters_base_name = "encoder" - support_prompt_tuning = False - - def init_adapters(self, model_config, adapters_config): - super().init_adapters(model_config, adapters_config) - self.encoder.layernorm_embedding.register_forward_hook(self.post_embedding_forward) - - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - if hasattr(self, "encoder"): - for i, layer in enumerate(self.encoder.layers): - yield i, layer - for i, layer in enumerate(self.decoder.layers, start=len(self.encoder.layers)): - yield i, layer - else: - for i, layer in enumerate(self.decoder.layers): - yield i, layer - - def post_embedding_forward(self, module, args, embedding_output): - embedding_output = self.invertible_adapters_forward(embedding_output) - # Prompt tuning not yet supported - return embedding_output - - -class PLBartDecoderWrapperAdaptersMixin(EmbeddingAdaptersWrapperMixin, ModelBaseAdaptersMixin): - """Adds adapters to the PLBartDecoderWrapper class.""" - - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - for i, layer in enumerate(self.decoder.layers): - yield i, layer - - def get_input_embeddings(self): - return self.decoder.get_input_embeddings() diff --git a/src/adapters/models/plbart/modeling_plbart.py b/src/adapters/models/plbart/modeling_plbart.py deleted file mode 100644 index 2d812cae1..000000000 --- a/src/adapters/models/plbart/modeling_plbart.py +++ /dev/null @@ -1,537 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""PyTorch PLBART model.""" -from typing import Optional, Tuple - -import torch -import torch.utils.checkpoint -from torch import nn - -from transformers.models.plbart.modeling_plbart import PLBartAttention, PLBartDecoderLayer, PLBartEncoderLayer -from transformers.utils import logging - -from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_, match_attn_matrices_for_parallel -from .mixin_plbart import ( - PLBartAttentionAdaptersMixin, - PLBartDecoderLayerAdaptersMixin, - PLBartEncoderLayerAdaptersMixin, -) - - -logger = logging.get_logger(__name__) - - -class PLBartAttentionWithAdapters(PLBartAttentionAdaptersMixin, PLBartAttention): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - query_states, key_states, value_states = match_attn_matrices_for_parallel( - query_states, key_states, value_states - ) - (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - bsz = query_states.size(0) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.reshape(*proj_shape) - value_states = value_states.reshape(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if layer_head_mask is not None: - if layer_head_mask.size() != (self.num_heads,): - raise ValueError( - f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" - f" {layer_head_mask.size()}" - ) - attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned across GPUs when using tensor-parallelism. 
- attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -class PLBartFlashAttention2WithAdapters(PLBartAttentionAdaptersMixin, PLBartAttention): - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # PLBartFlashAttention2 attention does not support output_attentions - if output_attentions: - raise ValueError("PLBartFlashAttention2 attention does not support output_attentions") - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, q_len, _ = hidden_states.size() - - # get query proj - query_states = self._reshape(self.q_proj(hidden_states), -1, bsz) - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0].transpose(1, 2) - value_states = past_key_value[1].transpose(1, 2) - elif is_cross_attention: - # cross_attentions - key_states = self._reshape(self.k_proj(key_value_states), -1, bsz) - value_states = self._reshape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) - value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1) - value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1) - else: - # self_attention - key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) - value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) - - query_states, key_states, value_states = match_attn_matrices_for_parallel( - query_states, key_states, value_states - ) - (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2)) - - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - bsz = query_states.size(0) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. (LlamaRMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - # Handle the case where the model is quantized - if hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - "The input hidden states seems to be silently casted in float32, this might be related to the fact" - " you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout - ) - - attn_output = attn_output.reshape(bsz, q_len, -1) - attn_output = self.out_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class PLBartSdpaAttentionWithAdapters(PLBartAttentionAdaptersMixin, PLBartAttention): - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - if output_attentions or layer_head_mask is not None: - # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "PLBartModel is using PLBartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does" - " not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual" - " attention implementation, but specifying the manual implementation will be required from" - " Transformers version v5.0.0 onwards. This warning can be removed using the argument" - ' `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states, - key_value_states=key_value_states, - past_key_value=past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - query_states, key_states, value_states = match_attn_matrices_for_parallel( - query_states, key_states, value_states - ) - (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - bsz = query_states.size(0) - - query_states = self._shape(query_states, tgt_len, bsz) - - # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, - # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. 
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1, - ) - - if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned across GPUs when using tensor-parallelism. - attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, None, past_key_value - - -class PLBartEncoderLayerWithAdapters(PLBartEncoderLayerAdaptersMixin, PLBartEncoderLayer): - def forward( - self, - hidden_states: torch.FloatTensor, - attention_mask: torch.FloatTensor, - layer_head_mask: torch.FloatTensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size - `(encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - adjust_tensors_for_parallel_(hidden_states, attention_mask) - - residual = hidden_states - hidden_states, attn_weights, _ = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.attention_adapters(hidden_states, residual, self.self_attn_layer_norm) - - residual = hidden_states - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.output_adapters(hidden_states, residual, self.final_layer_norm) - - if hidden_states.dtype == torch.float16 and ( - torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() - ): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -class PLBartDecoderLayerWithAdapters(PLBartDecoderLayerAdaptersMixin, PLBartDecoderLayer): - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - cross_attn_layer_head_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = True, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states 
(`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(batch, seq_len, embed_dim)` - encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size - `(encoder_attention_heads,)`. - cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of - size `(decoder_attention_heads,)`. - past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - adjust_tensors_for_parallel_(hidden_states, attention_mask, encoder_attention_mask) - - residual = hidden_states - - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.attention_adapters(hidden_states, residual, self.self_attn_layer_norm) - - # Cross-Attention Block - cross_attn_present_key_value = None - cross_attn_weights = None - if encoder_hidden_states is not None: - residual = hidden_states - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( - hidden_states=hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.cross_attention_adapters(hidden_states, residual, self.encoder_attn_layer_norm) - - # add cross-attn to positions 3,4 of present_key_value tuple - present_key_value = present_key_value + cross_attn_present_key_value - - # Fully Connected - residual = hidden_states - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.output_adapters(hidden_states, residual, self.final_layer_norm) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) - - if use_cache: - outputs += (present_key_value,) - - return outputs diff 
--git a/src/adapters/models/roberta/adapter_model.py b/src/adapters/models/roberta/adapter_model.py index ab9411ef7..87858566b 100644 --- a/src/adapters/models/roberta/adapter_model.py +++ b/src/adapters/models/roberta/adapter_model.py @@ -53,7 +53,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None diff --git a/src/adapters/models/t5/adapter_model.py b/src/adapters/models/t5/adapter_model.py index 5aa7aff4f..b544252ce 100644 --- a/src/adapters/models/t5/adapter_model.py +++ b/src/adapters/models/t5/adapter_model.py @@ -74,7 +74,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -154,7 +154,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs, + **kwargs ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/adapters/models/t5/modeling_t5.py b/src/adapters/models/t5/modeling_t5.py index c98cfa477..03d9f2797 100644 --- a/src/adapters/models/t5/modeling_t5.py +++ b/src/adapters/models/t5/modeling_t5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch T5 model.""" +""" PyTorch T5 model.""" import torch from torch import nn diff --git a/src/adapters/models/vit/modeling_vit.py b/src/adapters/models/vit/modeling_vit.py index 323fb6cab..f8c02bd93 100644 --- a/src/adapters/models/vit/modeling_vit.py +++ b/src/adapters/models/vit/modeling_vit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch ViT model.""" +""" PyTorch ViT model.""" import math @@ -23,7 +23,7 @@ from torch import nn from adapters.composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel -from transformers.models.vit.modeling_vit import ViTLayer, ViTOutput, ViTSdpaSelfAttention, ViTSelfAttention +from transformers.models.vit.modeling_vit import ViTLayer, ViTOutput, ViTSelfAttention from .mixin_vit import ViTLayerAdaptersMixin, ViTOutputAdaptersMixin, ViTSelfAttentionAdaptersMixin @@ -70,38 +70,6 @@ def forward( return outputs -class ViTSdpaSelfAttentionWithAdapters(ViTSelfAttentionAdaptersMixin, ViTSdpaSelfAttention): - def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False - ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - mixed_query_layer = self.query(hidden_states) - - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - query_layer = self.transpose_for_scores(mixed_query_layer) - - query_layer, key_layer, value_layer = match_attn_matrices_for_parallel(query_layer, key_layer, value_layer) - - key_layer, value_layer, _ = self.prefix_tuning(key_layer, value_layer, hidden_states) - (query_layer,) = adjust_tensors_for_parallel(key_layer, query_layer) - - context_layer = torch.nn.functional.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - head_mask, - self.attention_probs_dropout_prob if self.training else 0.0, - is_causal=False, - scale=None, - ) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) - - return context_layer, None - - class ViTOutputWithAdapters(ViTOutputAdaptersMixin, ViTOutput): def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(hidden_states) diff --git a/src/adapters/models/xlm_roberta/adapter_model.py b/src/adapters/models/xlm_roberta/adapter_model.py index 1cab4aaac..8acfde792 100644 --- a/src/adapters/models/xlm_roberta/adapter_model.py +++ b/src/adapters/models/xlm_roberta/adapter_model.py @@ -56,7 +56,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None diff --git a/src/adapters/models/xmod/adapter_model.py b/src/adapters/models/xmod/adapter_model.py index a179fc6be..94cc43f71 100644 --- a/src/adapters/models/xmod/adapter_model.py +++ b/src/adapters/models/xmod/adapter_model.py @@ -59,7 +59,7 @@ def forward( head: Optional[str] = None, output_adapter_gating_scores: Optional[bool] = False, output_adapter_fusion_attentions: Optional[bool] = False, - **kwargs, + **kwargs ): # Flatten for multiple choice tasks input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None diff --git a/src/adapters/trainer.py b/src/adapters/trainer.py deleted file mode 100644 index 6be5b3ee7..000000000 --- a/src/adapters/trainer.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -import re -from typing import Callable, Dict, List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.utils.data.dataset import Dataset - -from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, 
__version__ -from transformers.configuration_utils import PretrainedConfig -from transformers.data.data_collator import DataCollator -from transformers.modeling_utils import unwrap_model -from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState -from transformers.trainer_utils import EvalPrediction -from transformers.training_args import TrainingArguments -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, is_sagemaker_mp_enabled, logging - -from .composition import AdapterCompositionBlock, Fuse - - -if is_sagemaker_mp_enabled(): - import smdistributed.modelparallel.torch as smp - - -logger = logging.get_logger(__name__) - - -class AdapterTrainer(Trainer): - def __init__( - self, - model: Union[PreTrainedModel, nn.Module] = None, - args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Dataset] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Callable[[], PreTrainedModel] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - adapter_names: Optional[List[List[str]]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None, - ): - super().__init__( - model, - args, - data_collator, - train_dataset, - eval_dataset, - tokenizer=tokenizer, - model_init=model_init, - compute_metrics=compute_metrics, - callbacks=[AdapterTrainerCallback(self)] + callbacks if callbacks else [AdapterTrainerCallback(self)], - optimizers=optimizers, - preprocess_logits_for_metrics=preprocess_logits_for_metrics, - ) - - if adapter_names is not None: - self.model.set_active_adapters(adapter_names) - # Set the defaults for loading/ saving model & adapters - if isinstance(self.model, PreTrainedModel): - model_frozen = getattr(self.model.base_model, "model_frozen", False) - else: - model_frozen = False - if model_frozen and self.model.active_adapters: - # Check if training AdapterFusion - self.train_adapter_fusion = ( - isinstance(self.model.active_adapters, Fuse) - or isinstance(self.model.active_adapters, AdapterCompositionBlock) - and any([isinstance(child, Fuse) for child in self.model.active_adapters.children]) - ) - if self.model.active_adapters is None: - raise ValueError( - "Expected a model with an active adapter setup." - "If you want to fully finetune the model use the Trainer class." - ) - if (self.label_names is None or len(self.label_names) < 1) and self.model.active_head is not None: - all_label_names = set() - for head in self.model._active_heads: - all_label_names |= set(self.model.heads[head].get_label_names()) - self.label_names = list(all_label_names) - - def create_optimizer(self): - """ - Setup the optimizer. - - We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the - Trainer's init through `optimizers`, or subclass and override this method in a subclass. 
- """ - opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model - - if self.optimizer is None: - decay_parameters = self.get_decay_parameter_names(opt_model) - if hasattr(self.model, "config") and hasattr(self.model.config, "adapters"): - match_str = r"adapter_fusion_layer\..*\.value" - decay_parameters = [name for name in decay_parameters if not re.match(match_str, name)] - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad) - ], - "weight_decay": self.args.weight_decay, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - }, - ] - - optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) - self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) - - if is_sagemaker_mp_enabled(): - self.optimizer = smp.DistributedOptimizer(self.optimizer) - - return self.optimizer - - def _save(self, output_dir: Optional[str] = None, state_dict=None): - # If we are executing this function, we are the process zero, so we don't check for that. - output_dir = output_dir if output_dir is not None else self.args.output_dir - os.makedirs(output_dir, exist_ok=True) - logger.info(f"Saving model checkpoint to {output_dir}") - # Save a trained model and configuration using `save_pretrained()`. - # They can then be reloaded using `from_pretrained()` - if not isinstance(self.model, PreTrainedModel): - if isinstance(unwrap_model(self.model), PreTrainedModel): - if state_dict is None: - state_dict = self.model.state_dict() - unwrap_model(self.model).save_pretrained(output_dir, state_dict=state_dict) - else: - logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") - if state_dict is None: - state_dict = self.model.state_dict() - torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) - else: - self.model.save_all_adapters(output_dir) - if self.train_adapter_fusion: - self.model.save_all_adapter_fusions(output_dir) - if hasattr(self.model, "heads"): - self.model.save_all_heads(output_dir) - if self.tokenizer is not None: - self.tokenizer.save_pretrained(output_dir) - - # Good practice: save your training arguments together with the trained model - torch.save(self.args, os.path.join(output_dir, "training_args.bin")) - - def _load_from_checkpoint(self, resume_from_checkpoint): - args = self.args - if os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): - logger.info(f"Loading model from {resume_from_checkpoint}).") - - if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): - config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) - checkpoint_version = config.transformers_version - if checkpoint_version is not None and checkpoint_version != __version__: - logger.warn( - f"You are resuming training from a checkpoint trained with {checkpoint_version} of " - f"Transformers but your current version is {__version__}. This is not recommended and could " - "yield to errors or unwanted behaviors." 
- ) - - if args.deepspeed: - # will be resumed in deepspeed_init - pass - else: - adapter_loaded = False - if os.path.isdir(resume_from_checkpoint): - adapter_loaded = self._load_adapters(resume_from_checkpoint) - self._load_adapter_fusions(resume_from_checkpoint) - # Save all heads for a model with heads - if hasattr(self.model, "heads"): - self._load_heads(resume_from_checkpoint) - - if not adapter_loaded: - raise Exception("Can't find a valid checkpoint at {}".format(resume_from_checkpoint)) - - def _load_adapters(self, resume_from_checkpoint): - adapter_loaded = False - for file_name in os.listdir(resume_from_checkpoint): - if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): - if "," not in file_name and "adapter_config.json" in os.listdir( - os.path.join(resume_from_checkpoint, file_name) - ): - self.model.load_adapter(os.path.join(os.path.join(resume_from_checkpoint, file_name))) - adapter_loaded = True - return adapter_loaded - - def _load_adapter_fusions(self, resume_from_checkpoint): - for file_name in os.listdir(resume_from_checkpoint): - if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): - if "," in file_name: - self.model.load_adapter_fusion(os.path.join(resume_from_checkpoint, file_name)) - - def _load_heads(self, resume_from_checkpoint): - for file_name in os.listdir(resume_from_checkpoint): - if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): - if "," not in file_name and "head_config.json" in os.listdir( - os.path.join(resume_from_checkpoint, file_name) - ): - self.model.load_head(os.path.join(resume_from_checkpoint, file_name)) - - def _load_best_model(self): - model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model - logger.info( - f"Loading best adapter(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." - ) - # attempt to re-load all adapters from checkpoint - for adapter in model.adapters_config.adapters: - adapter_dir = os.path.join(self.state.best_model_checkpoint, adapter) - if os.path.exists(adapter_dir): - model.load_adapter(adapter_dir) - model.adapter_to(adapter, device=self.args.device) - if self.train_adapter_fusion: - logger.info( - f"Loading best adapter fusion(s) from {self.state.best_model_checkpoint} (score:" - f" {self.state.best_metric})." - ) - # attempt to re-load all adapter fusions from checkpoint - for fusion in model.adapters_config.fusions: - fusion_dir = os.path.join(self.state.best_model_checkpoint, fusion) - if os.path.exists(fusion_dir): - model.load_adapter_fusion(fusion_dir) - model.adapter_fusion_to(fusion, device=self.args.device) - - -class AdapterTrainerCallback(TrainerCallback): - def __init__(self, trainer): - super().__init__() - self.trainer = trainer - - def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): - model = kwargs.pop("model") - model_frozen = getattr(model.base_model, "model_frozen", False) - if not model_frozen: - raise ValueError( - "The pre-trained model weights are not frozen. 
For training adapters, please call the train_adapter()" - " method" - ) - - def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): - # apply adapter fusion weight regularization on the value matrix - model = kwargs.pop("model") - if self.trainer.train_adapter_fusion: - fusion_reg_loss = model.base_model.get_fusion_regularization_loss() - if fusion_reg_loss is not None: - fusion_reg_loss.backward() - - -class Seq2SeqAdapterTrainer(AdapterTrainer, Seq2SeqTrainer): - pass diff --git a/src/adapters/training.py b/src/adapters/training 2.py similarity index 98% rename from src/adapters/training.py rename to src/adapters/training 2.py index 5d053affb..831601139 100644 --- a/src/adapters/training.py +++ b/src/adapters/training 2.py @@ -83,7 +83,7 @@ def setup_adapter_training( else: lang_adapter_name = None # Freeze all model weights except of those of this adapter - model.train_adapter(adapter_name) + model.train_adapter([adapter_name]) # Set the adapters to be used in every forward pass if lang_adapter_name: model.set_active_adapters(Stack(lang_adapter_name, adapter_name)) diff --git a/src/adapters/utils.py b/src/adapters/utils 2.py similarity index 77% rename from src/adapters/utils.py rename to src/adapters/utils 2.py index b85537f63..7338f4c3a 100644 --- a/src/adapters/utils.py +++ b/src/adapters/utils 2.py @@ -18,6 +18,7 @@ from os.path import basename, isdir, isfile, join from pathlib import Path from typing import Callable, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse from zipfile import ZipFile, is_zipfile import torch @@ -110,11 +111,11 @@ def __repr__(self): @dataclass class AdapterInfo: """ - Holds information about an adapter publicly available on the Hub. Returned by + Holds information about an adapter publicly available on AdapterHub or huggingface.co. Returned by :func:`list_adapters()`. Args: - source (str): The source repository of this adapter. Always 'hf' for adapters available on HF Model Hub. + source (str): The source repository of this adapter. Can be either "ah" (AdapterHub) or "hf" (huggingface.co). adapter_id (str): The unique identifier of this adapter. model_name (str, optional): The identifier of the model this adapter was trained for. task (str, optional): The task this adapter was trained for. @@ -140,16 +141,14 @@ def _minimize_dict(d): return d -def get_adapter_config_hash(config, length=16, ignore_params=[]): +def get_adapter_config_hash(config, length=16): """ Calculates the hash of a given adapter configuration which is used to identify this configuration. Returns: str: The resulting hash of the given config dict. """ - minimized_config = _minimize_dict( - {k: v for (k, v) in config.items() if k not in ADAPTER_CONFIG_HASH_IGNORE + ignore_params} - ) + minimized_config = _minimize_dict({k: v for (k, v) in config.items() if k not in ADAPTER_CONFIG_HASH_IGNORE}) # ensure hash is kept consistent to previous versions for name, default in ADAPTER_CONFIG_HASH_IGNORE_DEFAULT.items(): if minimized_config.get(name, None) == default: @@ -435,7 +434,7 @@ def parse_adapter_config_string(config_string: str) -> List[Tuple[str, dict]]: return adapter_configs -def resolve_adapter_config(config: Union[dict, str], local_map=None, **kwargs) -> dict: +def resolve_adapter_config(config: Union[dict, str], local_map=None, try_loading_from_hub=True, **kwargs) -> dict: """ Resolves a given adapter configuration specifier to a full configuration dictionary. 
@@ -445,6 +444,7 @@ def resolve_adapter_config(config: Union[dict, str], local_map=None, **kwargs) - - a dictionary: returned without further action - an identifier string available in local_map - the path to a file containing a full adapter configuration + - an identifier string available in Adapter-Hub Returns: dict: The resolved adapter configuration dictionary. @@ -464,6 +464,13 @@ def resolve_adapter_config(config: Union[dict, str], local_map=None, **kwargs) - return loaded_config["config"] else: return loaded_config + # download hub index file + if try_loading_from_hub: + index_file = download_cached(ADAPTER_HUB_CONFIG_FILE, **kwargs) + if not index_file: + raise EnvironmentError("Unable to load adapter hub index file. The file might be temporarily unavailable.") + with open(index_file, "r") as f: + config_index = json.load(f) # parse the config string config_pairs = parse_adapter_config_string(config) if len(config_pairs) > 0: @@ -473,6 +480,11 @@ def resolve_adapter_config(config: Union[dict, str], local_map=None, **kwargs) - if local_map and name in local_map: config_obj = local_map[name] full_configs.append(config_obj.replace(**config_kwargs)) + # now, try to find in hub index + elif try_loading_from_hub and name in config_index: + config_obj = config_index[name] + config_obj.update(**config_kwargs) + full_configs.append(config_obj) else: raise ValueError("Could not identify '{}' as a valid adapter configuration.".format(name)) # Case 1: only one config, return it directly @@ -576,16 +588,34 @@ def _get_matching_version(config_entry, org): raise ValueError("Multiple adapters with this name are available for this config.") +def http_get_json(url): + # check if it's a relative url + if not urlparse(url).netloc: + url = urljoin(ADAPTER_HUB_URL, url) + response = requests.get(url) + if response.status_code == 200: + return response.json() + else: + raise EnvironmentError("Failed to get file {}".format(url)) + + +def get_checksum(file_entry: dict): + for algo in hashlib.algorithms_guaranteed: + if algo in file_entry: + return algo, file_entry[algo] + + def pull_from_hub( specifier: str, model_name: str, adapter_config: Optional[Union[dict, str]] = None, version: str = None, strict: bool = False, - **kwargs, + redirect_to_hf_hub: bool = False, + **kwargs ) -> str: """ - Redirects loading from the archived Hub repository to HuggingFace Model Hub. + Downloads a pre-trained adapter module from Adapter-Hub Args: specifier (str): A string specifying the adapter to be loaded. @@ -594,6 +624,9 @@ def pull_from_hub( version (str, optional): The version of the adapter to be loaded. Defaults to None. strict (bool, optional): If set to True, only allow adapters exactly matching the given config to be loaded. Defaults to False. + redirect_to_hf_hub (bool, optional): + If set to True, the function will redirect to the HuggingFace Model Hub instead of AdapterHub. + Defaults to False. Returns: str: The local path to which the adapter has been downloaded. @@ -609,12 +642,35 @@ def pull_from_hub( raise EnvironmentError("No adapter with name '{}' was found in the adapter index.".format(specifier)) hf_hub_specifier = "AdapterHub/" + os.path.basename(hub_entry_url).split(".")[0] - logger.warning( - "Automatic redirect to HF Model Hub repo '{}'. Please switch to the new ID to remove this warning.".format( - hf_hub_specifier + if redirect_to_hf_hub: + logger.warning( + "Automatic redirect to HF Model Hub repo '{}'. 
Please switch to the new ID to remove this warning.".format( + hf_hub_specifier + ) + ) + return pull_from_hf_model_hub(hf_hub_specifier, version=version, **kwargs) + else: + logger.warning( + "Loading adapters from this source is deprecated. This adapter has moved to '{}'. Please switch to the new" + " ID to remove this warning.".format(hf_hub_specifier) ) - ) - return pull_from_hf_model_hub(hf_hub_specifier, version=version, **kwargs) + + hub_entry = http_get_json(hub_entry_url) + # set version + if not version: + version = hub_entry["default_version"] + elif version not in hub_entry["files"]: + logger.warning("Version '{}' of adapter '{}' not found. Falling back to default.".format(version, specifier)) + version = hub_entry["default_version"] + file_entry = hub_entry["files"][version] + + # start downloading + logger.info("Resolved adapter files at {}.".format(file_entry["url"])) + checksum_algo, checksum = get_checksum(file_entry) + download_path = download_cached(file_entry["url"], checksum=checksum, checksum_algo=checksum_algo, **kwargs) + if not download_path: + raise EnvironmentError("Unable to load file from {}. The file might be unavailable.".format(file_entry["url"])) + return download_path def pull_from_hf_model_hub(specifier: str, version: str = None, **kwargs) -> str: @@ -633,7 +689,9 @@ def resolve_adapter_path( model_name: str = None, adapter_config: Union[dict, str] = None, version: str = None, - **kwargs, + source: str = None, + redirect_to_hf_hub: bool = False, + **kwargs ) -> str: """ Resolves the path to a pre-trained adapter module. Note: If attempting to resolve an adapter from the Hub, @@ -648,6 +706,15 @@ def resolve_adapter_path( model_name (str, optional): The identifier of the pre-trained model for which to load an adapter. adapter_config (Union[dict, str], optional): The configuration of the adapter to be loaded. version (str, optional): The version of the adapter to be loaded. Defaults to None. + source (str, optional): Identifier of the source(s) from where to get adapters. Can be either: + + - "ah": search on AdapterHub.ml. Note: this source is deprecated in favor of "hf". + - "hf": search on HuggingFace model hub (huggingface.co). + - None (default): search on all sources + + redirect_to_hf_hub (bool, optional): + If set to True, the function will redirect to the HuggingFace Model Hub instead of AdapterHub. + Defaults to False. Returns: str: The local path from where the adapter module can be loaded. @@ -672,13 +739,24 @@ def resolve_adapter_path( WEIGHTS_NAME, CONFIG_NAME, adapter_name_or_path ) ) - else: + elif source == "ah": + return pull_from_hub( + adapter_name_or_path, + model_name, + adapter_config=adapter_config, + version=version, + redirect_to_hf_hub=redirect_to_hf_hub, + **kwargs, + ) + elif source == "hf": + return pull_from_hf_model_hub(adapter_name_or_path, version=version, **kwargs) + elif source is None: try: - logger.info("Attempting to load adapter from HF Model Hub...") + logger.info("Attempting to load adapter from source 'hf'...") return pull_from_hf_model_hub(adapter_name_or_path, version=version, **kwargs) except (EnvironmentError, ValueError) as ex: logger.info(ex) - logger.info("Attempting to redirect from archived Hub repo...") + logger.info("Attempting to load adapter from source 'ah'...") try: return pull_from_hub( adapter_name_or_path, @@ -691,70 +769,103 @@ def resolve_adapter_path( except Exception as ex: logger.info(ex) raise EnvironmentError( - "Unable to load adapter {} from any source. 
Please check the name of the adapter or the source.".format( - adapter_name_or_path - ) + "Unable to load adapter {} from any source. Please check the name of the adapter or the source." + .format(adapter_name_or_path) ) + else: + raise ValueError("Unable to identify {} as a valid module location.".format(adapter_name_or_path)) -def list_adapters(model_name: str = None) -> List[AdapterInfo]: +def list_adapters(source: str = None, model_name: str = None) -> List[AdapterInfo]: """ Retrieves a list of all publicly available adapters on AdapterHub.ml or on huggingface.co. Args: + source (str, optional): Identifier of the source(s) from where to get adapters. Can be either: + + - "ah": search on AdapterHub.ml. + - "hf": search on HuggingFace model hub (huggingface.co). + - None (default): search on all sources + model_name (str, optional): If specified, only returns adapters trained for the model with this identifier. """ adapters = [] - if "fetch_config" in inspect.signature(HfApi.list_models).parameters: - kwargs = {"full": True, "fetch_config": True} - else: - logger.warning( - "Using old version of huggingface-hub package for fetching. Please upgrade to latest version for" - " accurate results." - ) - kwargs = {"full": True} - all_hf_adapters_data = HfApi().list_models(filter="adapters", **kwargs) - for model_info in all_hf_adapters_data: - adapter_info = AdapterInfo( - source="hf", - adapter_id=model_info.modelId, - model_name=model_info.config.get("adapters", {}).get("model_name") if model_info.config else None, - username=model_info.modelId.split("/")[0], - sha1_checksum=model_info.sha, - ) - adapters.append(adapter_info) + if source == "ah" or source is None: + try: + all_ah_adapters_file = download_cached(ADAPTER_HUB_ALL_FILE) + except requests.exceptions.HTTPError: + raise EnvironmentError( + "Unable to load list of adapters from AdapterHub.ml. The service might be temporarily unavailable." + ) + with open(all_ah_adapters_file, "r") as f: + all_ah_adapters_data = json.load(f) + adapters += [AdapterInfo(**info) for info in all_ah_adapters_data] + if source == "hf" or source is None: + if "fetch_config" in inspect.signature(HfApi.list_models).parameters: + kwargs = {"full": True, "fetch_config": True} + else: + logger.warning( + "Using old version of huggingface-hub package for fetching. Please upgrade to latest version for" + " accurate results." + ) + kwargs = {"full": True} + all_hf_adapters_data = HfApi().list_models(filter="adapters", **kwargs) + for model_info in all_hf_adapters_data: + adapter_info = AdapterInfo( + source="hf", + adapter_id=model_info.modelId, + model_name=model_info.config.get("adapters", {}).get("model_name") if model_info.config else None, + username=model_info.modelId.split("/")[0], + sha1_checksum=model_info.sha, + ) + adapters.append(adapter_info) if model_name is not None: adapters = [adapter for adapter in adapters if adapter.model_name == model_name] return adapters -def get_adapter_info(adapter_id: str) -> Optional[AdapterInfo]: +def get_adapter_info(adapter_id: str, source: str = "ah") -> Optional[AdapterInfo]: """ Retrieves information about a specific adapter. Args: adapter_id (str): The identifier of the adapter to retrieve. + source (str, optional): Identifier of the source(s) from where to get adapters. Can be either: + + - "ah": search on AdapterHub.ml. + - "hf": search on HuggingFace model hub (huggingface.co). Returns: AdapterInfo: The adapter information or None if the adapter was not found. 
""" - try: - model_info = HfApi().model_info(adapter_id) - return AdapterInfo( - source="hf", - adapter_id=model_info.modelId, - model_name=( - model_info.config.get("adapter_transformers", {}).get("model_name") if model_info.config else None - ), - username=model_info.modelId.split("/")[0], - sha1_checksum=model_info.sha, - ) - except requests.exceptions.HTTPError: - return None + if source == "ah": + if adapter_id.startswith("@"): + adapter_id = adapter_id[1:] + try: + data = http_get_json(f"/adapters/{adapter_id}.json") + return AdapterInfo(**data["info"]) + except EnvironmentError: + return None + elif source == "hf": + try: + model_info = HfApi().model_info(adapter_id) + return AdapterInfo( + source="hf", + adapter_id=model_info.modelId, + model_name=model_info.config.get("adapter_transformers", {}).get("model_name") + if model_info.config + else None, + username=model_info.modelId.split("/")[0], + sha1_checksum=model_info.sha, + ) + except requests.exceptions.HTTPError: + return None + else: + raise ValueError("Please specify either 'ah' or 'hf' as source.") -def prefix_attention_mask(attention_mask, dim: Union[int, List[int]] = 3, prefix_value: int = 0): +def prefix_attention_mask(attention_mask, dim: int = 3, prefix_value: int = 0): """ Adds a prefix to an attention mask. The length of the prefix is determined by the `prefix_attention_mask_length` attribute in the ForwardContext. @@ -779,21 +890,18 @@ def prefix_attention_mask(attention_mask, dim: Union[int, List[int]] = 3, prefix and forward_context is not None and getattr(forward_context, "prompt_tokens_length", None) is not None ): - if isinstance(dim, int): - dim = [dim] - for d in dim: - # Create a tensor of ones with the desired shape - ones_shape = list(attention_mask.shape) - ones_shape[d] = forward_context.prompt_tokens_length - - prefix_attention_mask = torch.full( - ones_shape, - prefix_value, - dtype=attention_mask.dtype, - ).to(attention_mask.device) - - # Concatenate the prefix_attention_mask along the specified dimension - attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=d) + # Create a tensor of ones with the desired shape + ones_shape = list(attention_mask.shape) + ones_shape[dim] = forward_context.prompt_tokens_length + + prefix_attention_mask = torch.full( + ones_shape, + prefix_value, + dtype=attention_mask.dtype, + ).to(attention_mask.device) + + # Concatenate the prefix_attention_mask along the specified dimension + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=dim) return attention_mask diff --git a/src/adapters/wrappers/configuration.py b/src/adapters/wrappers/configuration.py index ed224cd60..c49f3b8b7 100644 --- a/src/adapters/wrappers/configuration.py +++ b/src/adapters/wrappers/configuration.py @@ -46,12 +46,6 @@ "hidden_dropout_prob": "dropout", "attention_probs_dropout_prob": "attention_dropout", }, - "plbart": { - "num_attention_heads": "encoder_attention_heads", - "hidden_size": "d_model", - "hidden_dropout_prob": "dropout", - "attention_probs_dropout_prob": "attention_dropout", - }, "roberta": {}, "t5": { "hidden_size": "d_model", diff --git a/src/adapters/wrappers/model.py b/src/adapters/wrappers/model.py index 12ed79e12..23998a81b 100644 --- a/src/adapters/wrappers/model.py +++ b/src/adapters/wrappers/model.py @@ -95,7 +95,7 @@ def load_model( model_name_or_path: Optional[Union[str, os.PathLike]], model_class: Type[PreTrainedModel], *model_args: Any, - **kwargs: Any, + **kwargs: Any ) -> PreTrainedModel: """ Loads a pretrained model with adapters 
from the given path or url. diff --git a/tests/fixtures/samples/cifar10/cifar10.py b/tests/fixtures/samples/cifar10/cifar10.py index 052a203df..cd00f0260 100644 --- a/tests/fixtures/samples/cifar10/cifar10.py +++ b/tests/fixtures/samples/cifar10/cifar10.py @@ -1,7 +1,6 @@ """ CIFAR-10 demo data, adapted from https://huggingface.co/datasets/cifar10. """ - import os import pickle diff --git a/tests/methods/base.py b/tests/methods/base.py index 6ede68f2f..3954aece4 100644 --- a/tests/methods/base.py +++ b/tests/methods/base.py @@ -46,7 +46,7 @@ def run_add_test(self, model, adapter_config, filter_keys): name = "test_adapter_" + adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) model.to(torch_device) # adapter is correctly added to config @@ -67,7 +67,7 @@ def run_leave_out_test(self, model, adapter_config, leave_out): adapter_config = adapter_config.replace(leave_out=leave_out) name = "test_adapter_" + adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # adapter is correctly added to config self.assert_adapter_available(model, name) @@ -81,10 +81,10 @@ def run_leave_out_test(self, model, adapter_config, leave_out): model.delete_adapter(name) - def run_linear_average_test(self, model, adapter_config, filter_keys): + def run_average_test(self, model, adapter_config, filter_keys): model.eval() - weights = [-0.2, 0.9, 0.3] + weights = [0.1, 0.6, 0.3] # add adapters to average name = "test_adapter_" + adapter_config.__class__.__name__ @@ -103,9 +103,7 @@ def run_linear_average_test(self, model, adapter_config, filter_keys): averaged_weights[base_k] += w * v # average adapters - model.average_adapter( - name, [name + f"_{i}" for i in range(len(weights))], weights=weights, combine_strategy="linear" - ) + model.average_adapter(name, [name + f"_{i}" for i in range(len(weights))], weights=weights) # adapter is correctly added to config self.assertTrue(name in model.adapters_config) @@ -121,7 +119,7 @@ def run_delete_test(self, model, adapter_config, filter_keys): name = "test_adapter_" + adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) model.to(torch_device) # adapter is correctly added to config @@ -142,7 +140,7 @@ def run_get_test(self, model, adapter_config, num_expected_modules): model.eval() model.add_adapter("first", config=adapter_config) - model.set_active_adapters("first") + model.set_active_adapters(["first"]) # adapter is correctly added to config name = "first" @@ -167,7 +165,7 @@ def run_forward_test(self, model, adapter_config, dtype=torch.float32): input_data = self.get_input_samples(config=model.config, dtype=dtype) # pass 1: set adapter via property - model.set_active_adapters(name) + model.set_active_adapters([name]) output_1 = model(**input_data) # pass 2: set via context @@ -191,7 +189,7 @@ def run_load_test(self, adapter_config): name = "dummy_adapter" model1.add_adapter(name, config=adapter_config) - model1.set_active_adapters(name) + model1.set_active_adapters([name]) with tempfile.TemporaryDirectory() as temp_dir: model1.save_adapter(temp_dir, name) @@ -245,7 +243,7 @@ def run_full_model_load_test(self, adapter_config): output1 = model1(**input_data) output2 = model2(**input_data) self.assertEqual(len(output1), len(output2)) - self.assertTrue(torch.allclose(output1[0], output2[0], 
atol=1e-4)) + self.assertTrue(torch.equal(output1[0], output2[0])) def trainings_run(self, model, lr=1.0, steps=8): # setup dataset @@ -333,7 +331,7 @@ def run_merge_test(self, adapter_config): input_data = self.get_input_samples(config=model.config) # forward in training mode - model.set_active_adapters("test_lora") + model.set_active_adapters(["test_lora"]) output_1 = model(**input_data) # forward in merged mode diff --git a/tests/methods/test_adapter_common.py b/tests/methods/test_adapter_common.py index 1ea6cd6f3..b1d67757b 100644 --- a/tests/methods/test_adapter_common.py +++ b/tests/methods/test_adapter_common.py @@ -53,13 +53,13 @@ def test_leave_out_adapter(self): with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): self.run_leave_out_test(model, adapter_config, self.leave_out_layers) - def test_linear_average_adapter(self): + def test_average_adapter(self): model = self.get_model() model.eval() for adapter_config, filter_keys in self.adapter_configs_to_test: with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): - self.run_linear_average_test(model, adapter_config, filter_keys) + self.run_average_test(model, adapter_config, filter_keys) def test_delete_adapter(self): model = self.get_model() @@ -79,7 +79,7 @@ def test_add_adapter_with_invertible(self): with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): name = adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # adapter is correctly added to config self.assertTrue(name in model.adapters_config) @@ -128,7 +128,7 @@ def test_delete_adapter_with_invertible(self): with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): name = adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # check if adapter is correctly added to config self.assert_adapter_available(model, name) @@ -178,7 +178,7 @@ def test_add_adapter_multiple_reduction_factors(self): with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): name = adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # adapter is correctly added to config self.assertTrue(name in model.adapters_config) diff --git a/tests/methods/test_compacter.py b/tests/methods/test_compacter.py index 75716ffa6..ffe7e0eae 100644 --- a/tests/methods/test_compacter.py +++ b/tests/methods/test_compacter.py @@ -14,11 +14,9 @@ def test_leave_out_compacter(self): model = self.get_model() self.run_leave_out_test(model, CompacterPlusPlusConfig(phm_dim=2, reduction_factor=8), self.leave_out_layers) - def test_linear_average_compacter(self): + def test_average_compacter(self): model = self.get_model() - self.run_linear_average_test( - model, CompacterPlusPlusConfig(phm_dim=2, reduction_factor=8), ["adapters.{name}."] - ) + self.run_average_test(model, CompacterPlusPlusConfig(phm_dim=2, reduction_factor=8), ["adapters.{name}."]) def test_delete_compacter(self): model = self.get_model() diff --git a/tests/methods/test_ia3.py b/tests/methods/test_ia3.py index 3a30e2448..0dc81d02b 100644 --- a/tests/methods/test_ia3.py +++ b/tests/methods/test_ia3.py @@ -14,9 +14,9 @@ def test_leave_out_ia3(self): model = 
self.get_model() self.run_leave_out_test(model, IA3Config(), self.leave_out_layers) - def test_linear_average_ia3(self): + def test_average_ia3(self): model = self.get_model() - self.run_linear_average_test(model, IA3Config(), ["loras.{name}."]) + self.run_average_test(model, IA3Config(), ["loras.{name}."]) def test_delete_ia3(self): model = self.get_model() diff --git a/tests/methods/test_lora.py b/tests/methods/test_lora.py index 067f78c8b..0ade2bdbb 100644 --- a/tests/methods/test_lora.py +++ b/tests/methods/test_lora.py @@ -1,9 +1,4 @@ -import random - -import torch - from adapters import LoRAConfig -from adapters.methods.lora import LoRALayer from transformers.testing_utils import require_torch from .base import AdapterMethodBaseTestMixin @@ -19,268 +14,9 @@ def test_leave_out_lora(self): model = self.get_model() self.run_leave_out_test(model, LoRAConfig(), self.leave_out_layers) - def test_merging_with_other_adapters(self): - model = self.get_model() - model.add_adapter("lora", config="lora") - - # Add different adapters - model.add_adapter("bottleneck", config="seq_bn") - model.add_adapter("prompt", config="prompt_tuning") - model.add_adapter("prefix", config="prefix_tuning") - model.add_adapter("ia3", config="ia3") - model.add_adapter("unipelt", config="unipelt") - model.add_adapter("mam", config="mam") - model.add_adapter("compacter", config="compacter[phm_dim=2, reduction_factor=8]") - - # Merging adapters with different architectures with LoRA should raise a ValueError - for adapter_architecture in ["bottleneck", "prompt", "prefix", "ia3", "unipelt", "mam", "compacter"]: - with self.subTest(adapter_architecture=adapter_architecture): - with self.assertRaises(ValueError): - model.average_adapter( - adapter_name=f"average_lora_{adapter_architecture}", - adapter_list=[adapter_architecture, "lora"], - weights=[0.5, 0.5], - combine_strategy="linear", - ) - - def test_linear_average_lora(self): - model = self.get_model() - self.run_linear_average_test(model, LoRAConfig(), ["loras.{name}."]) - - def test_linear_average_only_negate_b_lora(self): - # This method tests that the linear average following the Zhang et al. 2023 paper works as expected. - # Paper: https://proceedings.neurips.cc/paper_files/paper/2023/hash/299a08ee712d4752c890938da99a77c6-Abstract-Conference.html - # This method is an adapted version of the `run_linear_average_test` method. - model = self.get_model() - model.eval() - weights = [-1, 1.5, 0.5] - - # add adapters to average - name = "test_adapter_" + LoRAConfig().__class__.__name__ - for i in range(len(weights)): - model.add_adapter( - f"{name}_{i}", - config=LoRAConfig( - dropout=random.random(), - init_weights=["bert", "lora"][i % 2], - ), - ) - - averaged_weights = {} - for i, w in enumerate(weights): - this_filter_keys = [k.format(name=f"{name}_{i}") for k in ["loras.{name}."]] - for k, v in self.filter_parameters(model, this_filter_keys).items(): - base_k = k.replace(f"{name}_{i}", name) - # Only negate the lora_B weights and use the absolute value of the weight for lora_A weights. 
- weight = abs(w) if "lora_A" in k else w - if base_k not in averaged_weights: - averaged_weights[base_k] = weight * v - else: - averaged_weights[base_k] += weight * v - - # average adapters - model.average_adapter( - name, - [f"{name}_{i}" for i in range(len(weights))], - weights=weights, - combine_strategy="lora_linear_only_negate_b", - ) - - # adapter is correctly added to config - self.assertTrue(name in model.adapters_config) - config = model.adapters_config.get(name) - self.assertEqual(LoRAConfig(dropout=config.dropout, init_weights=config.init_weights), config) - - # compare averaged weights to collected weights - this_filter_keys = [k.format(name=name) for k in ["loras.{name}."]] - for k, v in self.filter_parameters(model, this_filter_keys).items(): - self.assertTrue(torch.allclose(v, averaged_weights[k]), k) - - def _check_svd_weights(self, delta_w, merged_lora, svd_rank, atol=1e-5): - # Compute SVD of the original delta_w - u, s, v = torch.svd(delta_w) - u = u[:, :svd_rank] - s = s[:svd_rank] - v = v[:, :svd_rank] - - # Reconstruct A and B matrices - expected_A = v.t() - expected_B = u @ torch.diag(s) - - # Compare with merged adapter - self.assertTrue(torch.allclose(expected_A, merged_lora.lora_A, atol=atol)) - self.assertTrue(torch.allclose(expected_B, merged_lora.lora_B, atol=atol)) - - def test_linear_delta_w_svd_average_lora(self): - model = self.get_model() - model.eval() - model_supports_lora_delta_w_svd = model.base_model.support_lora_delta_w_svd - weights = [-1, 1.5, 0.5] - - # add adapters to average - name = "test_adapter_" + LoRAConfig().__class__.__name__ - for i in range(len(weights)): - model.add_adapter( - f"{name}_{i}", - config=LoRAConfig( - dropout=random.random(), - init_weights=["bert", "lora"][i % 2], - ), - ) - - if not model_supports_lora_delta_w_svd: - # Some models (GPT2, Deberta) don't support this merging method - with self.assertRaises(ValueError): - model.average_adapter( - "averaged_adapter", - [f"{name}_{i}" for i in range(len(weights))], - weights=weights, - combine_strategy="lora_delta_w_svd", - ) - - return - - # average adapters - svd_rank = 16 - model.average_adapter( - "averaged_adapter", - [f"{name}_{i}" for i in range(len(weights))], - weights=weights, - combine_strategy="lora_delta_w_svd", - svd_rank=svd_rank, - ) - - # adapter is correctly added to config - self.assertTrue("averaged_adapter" in model.adapters_config) - config = model.adapters_config.get("averaged_adapter") - self.assertEqual(LoRAConfig(dropout=config.dropout, init_weights=config.init_weights, r=svd_rank), config) - - # Calculate the new weights: Matrix A and B are SVD of all the weighted delta_w matrices of the adapters. 
- for i, layer in model.iter_layers(): - for module in layer.modules(): - if isinstance(module, LoRALayer): - # Check if this layer has the LoRA adapters - if not ( - f"{name}_0" in module.loras - and f"{name}_1" in module.loras - and f"{name}_2" in module.loras - and name in module.loras - ): - continue - - # Calculate the new weights - delta_w_1 = module.loras[name + "_0"].delta_w - delta_w_2 = module.loras[name + "_1"].delta_w - delta_w_3 = module.loras[name + "_2"].delta_w - delta_w = weights[0] * delta_w_1 + weights[1] * delta_w_2 + weights[2] * delta_w_3 - - self._check_svd_weights(delta_w, module.loras["averaged_adapter"], svd_rank) - - def test_edge_case_average_adapters_single_adapter(self): - # If we merge only one adapter, the weights of the new adapter should be the same as the original adapter + def test_average_lora(self): model = self.get_model() - model.eval() - model_supports_lora_delta_w_svd = model.base_model.support_lora_delta_w_svd - - # add adapters to average - name = "test_adapter_" + LoRAConfig().__class__.__name__ - for i in range(3): - model.add_adapter( - f"{name}_{i}", - config=LoRAConfig( - dropout=random.random(), - init_weights=["bert", "lora"][i % 2], - ), - ) - - # collect weights of the first adapter so we can compare them to the newly created adapters in the subsequent tests - filter_keys_adapter_0 = [k.format(name=f"{name}_0") for k in ["loras.{name}."]] - adapter_0 = self.filter_parameters(model, filter_keys_adapter_0) - - # Run tests for every combine strategy - for combine_strategy in ["linear", "lora_linear_only_negate_b", "lora_delta_w_svd"]: - if not model_supports_lora_delta_w_svd and combine_strategy == "lora_delta_w_svd": - continue - - with self.subTest(combine_strategy=combine_strategy): - svd_rank = LoRAConfig().r if combine_strategy == "lora_delta_w_svd" else None - model.average_adapter( - adapter_name=f"{combine_strategy}_merged", - adapter_list=[f"{name}_0"], - weights=[1], - combine_strategy=combine_strategy, - svd_rank=svd_rank, - ) - - filter_keys = [k.format(name=f"{combine_strategy}_merged") for k in ["loras.{name}."]] - - if combine_strategy != "lora_delta_w_svd": - for k, v in self.filter_parameters(model, filter_keys).items(): - adapter_0_key = k.replace(f"{combine_strategy}_merged", f"{name}_0") - self.assertTrue(torch.allclose(v, adapter_0[adapter_0_key])) - else: - # For lora_delta_w_svd, we need to calculate the expected weights since lora_delta_w_svd performs an SVD - for i, layer in model.iter_layers(): - for module in layer.modules(): - if isinstance(module, LoRALayer): - if f"{name}_0" in module.loras and f"{combine_strategy}_merged" in module.loras: - original_lora = module.loras[f"{name}_0"] - merged_lora = module.loras[f"{combine_strategy}_merged"] - self._check_svd_weights(original_lora.delta_w, merged_lora, svd_rank) - - def test_edge_case_average_adapters_multiple_adapters(self): - # If we merge multiple adapters with weight 0 except one adapter with weight 1, the resulting adapter should be the same as the adapter with weight 1 - model = self.get_model() - model.eval() - model_supports_lora_delta_w_svd = model.base_model.support_lora_delta_w_svd - - # add adapters to average - name = "test_adapter_" + LoRAConfig().__class__.__name__ - for i in range(3): - model.add_adapter( - f"{name}_{i}", - config=LoRAConfig( - dropout=random.random(), - init_weights=["bert", "lora"][i % 2], - ), - ) - - # collect weights of the first adapter so we can compare them to the newly created adapters in the subsequent tests - 
filter_keys_adapter_0 = [k.format(name=f"{name}_0") for k in ["loras.{name}."]] - adapter_0 = self.filter_parameters(model, filter_keys_adapter_0) - - # Run tests for every combine strategy - for combine_strategy in ["linear", "lora_linear_only_negate_b", "lora_delta_w_svd"]: - if not model_supports_lora_delta_w_svd and combine_strategy == "lora_delta_w_svd": - continue - - with self.subTest(combine_strategy=combine_strategy): - svd_rank = LoRAConfig().r if combine_strategy == "lora_delta_w_svd" else None - - # since normalize_weights is True, this should result in only the first adapter being used with a weight of 1 - model.average_adapter( - adapter_name=f"{combine_strategy}_merged", - adapter_list=[f"{name}_0", f"{name}_1", f"{name}_2"], - weights=[0.5, 0, 0], - combine_strategy=combine_strategy, - svd_rank=svd_rank, - ) - - filter_keys = [k.format(name=f"{combine_strategy}_merged") for k in ["loras.{name}."]] - - if combine_strategy != "lora_delta_w_svd": - for k, v in self.filter_parameters(model, filter_keys).items(): - adapter_1_key = k.replace(f"{combine_strategy}_merged", f"{name}_0") - self.assertTrue(torch.allclose(v, adapter_0[adapter_1_key])) - else: - # For lora_delta_w_svd, we need to calculate the expected weights since lora_delta_w_svd performs an SVD - for i, layer in model.iter_layers(): - for module in layer.modules(): - if isinstance(module, LoRALayer): - if f"{name}_0" in module.loras and f"{combine_strategy}_merged" in module.loras: - original_lora = module.loras[f"{name}_0"] - merged_lora = module.loras[f"{combine_strategy}_merged"] - self._check_svd_weights(original_lora.delta_w, merged_lora, svd_rank) + self.run_average_test(model, LoRAConfig(), ["loras.{name}."]) def test_delete_lora(self): model = self.get_model() diff --git a/tests/methods/test_prefix_tuning.py b/tests/methods/test_prefix_tuning.py index 2b351d0fc..a1c41268b 100644 --- a/tests/methods/test_prefix_tuning.py +++ b/tests/methods/test_prefix_tuning.py @@ -19,9 +19,9 @@ def test_leave_out_prefix_tuning(self): model = self.get_model() self.run_leave_out_test(model, PrefixTuningConfig(flat=True), self.leave_out_layers) - def test_linear_average_prefix_tuning(self): + def test_average_prefix_tuning(self): model = self.get_model() - self.run_linear_average_test(model, PrefixTuningConfig(flat=True), ["prefix_tunings.{name}."]) + self.run_average_test(model, PrefixTuningConfig(flat=True), ["prefix_tunings.{name}."]) def test_delete_prefix_tuning(self): model = self.get_model() @@ -62,7 +62,7 @@ def test_eject_prefix(self): input_data = self.get_input_samples(config=model.config) # user reparamterized prefix - model.set_active_adapters("test_prefix") + model.set_active_adapters(["test_prefix"]) output_1 = model(**input_data) # eject prefix diff --git a/tests/methods/test_prompt_tuning.py b/tests/methods/test_prompt_tuning.py index 97015d131..a5150e1aa 100644 --- a/tests/methods/test_prompt_tuning.py +++ b/tests/methods/test_prompt_tuning.py @@ -10,9 +10,9 @@ def test_add_prompt_tuning(self): model = self.get_model() self.run_add_test(model, PromptTuningConfig(prompt_length=10), ["prompt_tunings.{name}."]) - def test_linear_average_prompt_tuning(self): + def test_average_prompt_tuning(self): model = self.get_model() - self.run_linear_average_test(model, PromptTuningConfig(prompt_length=10), ["prompt_tunings.{name}."]) + self.run_average_test(model, PromptTuningConfig(prompt_length=10), ["prompt_tunings.{name}."]) def test_delete_prompt_tuning(self): model = self.get_model() diff --git 
a/tests/methods/test_reft.py b/tests/methods/test_reft.py index 884922180..5089d4ce1 100644 --- a/tests/methods/test_reft.py +++ b/tests/methods/test_reft.py @@ -29,7 +29,7 @@ def test_layers_reft(self): adapter_config = adapter_config.replace(layers=layers) name = "test_adapter_" + adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # adapter is correctly added to config self.assert_adapter_available(model, name) @@ -47,7 +47,7 @@ def test_average_reft(self): model = self.get_model() for adapter_config, filter_keys in self.reft_configs_to_test: with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): - self.run_linear_average_test(model, adapter_config, filter_keys) + self.run_average_test(model, adapter_config, filter_keys) def test_delete_reft(self): model = self.get_model() diff --git a/tests/methods/test_unipelt.py b/tests/methods/test_unipelt.py index d29fa5f18..83bbec522 100644 --- a/tests/methods/test_unipelt.py +++ b/tests/methods/test_unipelt.py @@ -10,11 +10,9 @@ def test_add_unipelt(self): model = self.get_model() self.run_add_test(model, UniPELTConfig(), ["loras.{name}.", "adapters.{name}.", "prefix_tunings.{name}."]) - def test_linear_average_unipelt(self): + def test_average_unipelt(self): model = self.get_model() - self.run_linear_average_test( - model, UniPELTConfig(), ["loras.{name}.", "adapters.{name}.", "prefix_tunings.{name}."] - ) + self.run_average_test(model, UniPELTConfig(), ["loras.{name}.", "adapters.{name}.", "prefix_tunings.{name}."]) def test_delete_unipelt(self): model = self.get_model() @@ -53,7 +51,7 @@ def test_output_adapter_gating_scores_unipelt(self): input_data = self.get_input_samples(config=model.config) - model.set_active_adapters(name) + model.set_active_adapters([name]) output_1 = model(**input_data, output_adapter_gating_scores=True) self.assertEqual(len(output_1[0]), self.default_input_samples_shape[0]) diff --git a/tests/models/test_plbart.py b/tests/models/test_plbart.py deleted file mode 100644 index 7fbbfc38d..000000000 --- a/tests/models/test_plbart.py +++ /dev/null @@ -1,12 +0,0 @@ -# flake8: noqa: F403,F405 -from adapters import PLBartAdapterModel -from hf_transformers.tests.models.plbart.test_modeling_plbart import * -from transformers.testing_utils import require_torch - -from .base import AdapterModelTesterMixin - - -@require_torch -class PLBartAdapterModelTest(AdapterModelTesterMixin, PLBartModelTest): - all_model_classes = (PLBartAdapterModel,) - fx_compatible = False diff --git a/tests/test_adapter_backward_compability.py b/tests/test_adapter_backward_compability.py index 6ec2ef214..03c04c792 100644 --- a/tests/test_adapter_backward_compability.py +++ b/tests/test_adapter_backward_compability.py @@ -14,7 +14,7 @@ def test_load_old_non_linearity(self): config = SeqBnConfig(non_linearity="gelu") name = "dummy" model1.add_adapter(name, config=config) - model1.set_active_adapters(name) + model1.set_active_adapters([name]) with tempfile.TemporaryDirectory() as temp_dir: model1.save_adapter(temp_dir, name) @@ -39,10 +39,10 @@ def test_save_version_with_adapter(self): config = SeqBnConfig(non_linearity="gelu") name = "dummy" model.add_adapter(name, config=config) - model.set_active_adapters(name) + model.set_active_adapters([name]) with tempfile.TemporaryDirectory() as temp_dir: model.save_adapter(temp_dir, name) with open(os.path.join(temp_dir, "adapter_config.json"), "r") as file: data = 
json.load(file) - self.assertEqual(__version__, data["version"].replace("adapters.", "")) + self.assertEqual(__version__, data["version"]) diff --git a/tests/test_adapter_config.py b/tests/test_adapter_config.py index db57aeae2..2bce31c7e 100644 --- a/tests/test_adapter_config.py +++ b/tests/test_adapter_config.py @@ -18,6 +18,15 @@ @require_torch class AdapterConfigTest(unittest.TestCase): + def test_config_load(self): + download_kwargs = {"force_download": True} + # TODO still uses the old config names as only these are available on the Hub + for config_name in ["pfeiffer", "houlsby"]: + with self.subTest(config_name=config_name): + config = AdapterConfig.load(config_name, download_kwargs=download_kwargs, non_linearity="leakyrelu") + self.assertTrue(isinstance(config, AdapterConfig)) + self.assertEqual(config.non_linearity, "leakyrelu") + def test_config_immutable(self): def set_attr(config: AdapterConfig): config.non_linearity = "dummy" diff --git a/tests/test_adapter_fusion_common.py b/tests/test_adapter_fusion_common.py index ccc860f66..4ee25fa06 100644 --- a/tests/test_adapter_fusion_common.py +++ b/tests/test_adapter_fusion_common.py @@ -38,7 +38,7 @@ def test_add_adapter_fusion(self): # check forward pass input_data = self.get_input_samples(config=model.config) - model.set_active_adapters(Fuse(name1, name2)) + model.set_active_adapters([[name1, name2]]) model.to(torch_device) adapter_output = model(**input_data) model.set_active_adapters(None) @@ -93,7 +93,7 @@ def test_load_adapter_fusion(self): model2.eval() model1.add_adapter_fusion([name1, name2], adater_fusion_config_name) - model1.set_active_adapters(Fuse(name1, name2)) + model1.set_active_adapters([[name1, name2]]) with tempfile.TemporaryDirectory() as temp_dir: model1.save_adapter_fusion(temp_dir, ",".join([name1, name2])) @@ -136,8 +136,8 @@ def test_load_full_model_fusion(self): # check equal output input_data = self.get_input_samples(config=model1.config) - model1.set_active_adapters(Fuse(name1, name2)) - model2.set_active_adapters(Fuse(name1, name2)) + model1.set_active_adapters([[name1, name2]]) + model2.set_active_adapters([[name1, name2]]) model1.to(torch_device) model2.to(torch_device) output1 = model1(**input_data) diff --git a/tests/test_adapter_heads.py b/tests/test_adapter_heads.py index 0de9134c0..af1749a94 100644 --- a/tests/test_adapter_heads.py +++ b/tests/test_adapter_heads.py @@ -6,7 +6,6 @@ import adapters from adapters import ADAPTER_MODEL_MAPPING, AdapterSetup, AutoAdapterModel from adapters.composition import BatchSplit, Stack -from adapters.heads import PredictionHead from transformers import AutoModelForSequenceClassification from transformers.testing_utils import require_torch, torch_device @@ -456,95 +455,3 @@ def test_save_all_adapters_with_head(self): with tempfile.TemporaryDirectory() as tmp_dir: model.save_all_adapters(tmp_dir, with_head=False) self.assertFalse(os.path.isfile(os.path.join(tmp_dir, "test", "head_config.json"))) - - def test_average_head(self): - # Test the average_head method - model = AutoAdapterModel.from_config(self.config()) - model.eval() - - # Add adapters (this is just to see if the method also works if some heads are associated with an adapter while others are not) - for i in range(2): - model.add_adapter(f"adapter_{i}") - - # Add heads - for i in range(3): - self.add_head(model, f"adapter_{i}") - - # Calculate the expected weights of the new head - weights = [0.75, 0.25, -0.25] - expected_new_head_weights = {} - - for i, weight in enumerate(weights): - current_head: 
PredictionHead = model.heads[f"adapter_{i}"] - for k, v in current_head.named_parameters(): - base_k = k.replace(f"adapter_{i}", "new_head") - if base_k not in expected_new_head_weights: - expected_new_head_weights[base_k] = weight * v - else: - expected_new_head_weights[base_k] += weight * v - - # Average the heads - model.average_head( - head_name="new_head", - head_list=["adapter_0", "adapter_1", "adapter_2"], - weights=weights, - normalize_weights=False, - ) - - # Check that the new head was added - self.assertIn("new_head", model.heads) - - # Now, check that the actual weights are the same as the expected weights. - # Problem: Some heads might have tied weights. These weights therefore are the same as the embedding weights and are NOT the same as the expected weights dictionary. - - # 1. Identify if a layer has tied weights - head1 = model.heads["adapter_0"] - tied_weight_keys = set() - if head1.get_output_embeddings() and model.config.tie_word_embeddings: - output_embeddings = head1.get_output_embeddings() - - # Depending on the head, the tied layer has a different number: Find the layer number of the output embeddings - for name, module in head1.named_modules(): - if module is output_embeddings: - layer_prefix = name + "." - break - - for k, _ in output_embeddings.named_parameters(): - tied_weight_keys.add(f"{layer_prefix}{k}") - - print(f"tied_weight_keys: {tied_weight_keys}") - - # 2. Compare the weights of the new head with the expected weights - for k, v in model.heads["new_head"].named_parameters(): - if k not in tied_weight_keys: - self.assertTrue(torch.allclose(v, expected_new_head_weights[k]), k) - - # 3. Last check: Ensure that tied weights are actually tied - if model.config.tie_word_embeddings: - input_embeddings = model.get_input_embeddings() - output_embeddings = model.heads["new_head"].get_output_embeddings() - if output_embeddings is not None: - self.assertTrue( - torch.allclose(input_embeddings.weight, output_embeddings.weight), - "Input and output embeddings are not properly tied", - ) - - def test_tied_head_weights(self): - # Some heads tie the weights of the last layer to the input embeddings. This test checks that these weights are not trained, except when setting train_embeddings=True - model = AutoAdapterModel.from_config(self.config()) - model.eval() - - # Check if model has add_masked_lm_head method - if "masked_lm" not in ADAPTER_MODEL_MAPPING[self.config_class].head_types: - self.skipTest("Model does not have masked language model head, skip test") - - model.add_adapter("mlm") - model.add_masked_lm_head("mlm") - - # 1. No training of embeddings => weights should not change - model.train_adapter("mlm") - self.assertFalse(model.heads["mlm"].get_output_embeddings().weight.requires_grad) - - # 2. 
Training of embeddings => weights should change - model.train_adapter("mlm", train_embeddings=True) - self.assertTrue(model.heads["mlm"].get_output_embeddings().weight.requires_grad) diff --git a/tests/test_adapter_hub.py b/tests/test_adapter_hub.py index fa29d13b1..267ab06d4 100644 --- a/tests/test_adapter_hub.py +++ b/tests/test_adapter_hub.py @@ -84,7 +84,7 @@ def test_load_task_adapter_from_hub(self): args=training_args, eval_dataset=eval_dataset, compute_metrics=self._compute_glue_metrics("mrpc"), - adapter_names="mrpc", + adapter_names=["mrpc"], ) result = trainer.evaluate() self.assertGreater(result["eval_acc"], 0.9) diff --git a/tests/test_adapter_safetensors.py b/tests/test_adapter_safetensors.py index 3c743c7a9..ef80dd43d 100644 --- a/tests/test_adapter_safetensors.py +++ b/tests/test_adapter_safetensors.py @@ -43,7 +43,7 @@ def test_safetensors_adapter(self): name = "test_adapter" model1.add_adapter(name) model1.add_classification_head(name, num_labels=2) - model1.set_active_adapters(name) + model1.set_active_adapters([name]) temp_dir = tempfile.TemporaryDirectory() # Save & reload adapter @@ -58,7 +58,7 @@ def test_safetensors_adapter(self): self.assertEqual(0, len(loading_info["unexpected_keys"])) # check if adapter was correctly loaded self.assertTrue(name in model2.adapters_config) - model2.set_active_adapters(name) + model2.set_active_adapters([name]) # check equal output input_data = self.get_input_samples((2, 32)) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index d313b656e..7fc358705 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -143,7 +143,7 @@ def test_resume_training_with_fusion(self): model.add_adapter("additional_adapter") model.add_adapter_fusion(Fuse("adapter", "additional_adapter")) model.set_active_adapters(Fuse("adapter", "additional_adapter")) - model.train_adapter_fusion(Fuse("adapter", "additional_adapter")) + model.train_fusion(Fuse("adapter", "additional_adapter")) training_args = TrainingArguments( output_dir=tmpdirname, @@ -167,7 +167,7 @@ def test_resume_training_with_fusion(self): model_resume.add_adapter("additional_adapter") model_resume.add_adapter_fusion(Fuse("adapter", "additional_adapter")) model_resume.set_active_adapters(Fuse("adapter", "additional_adapter")) - model_resume.train_adapter_fusion(Fuse("adapter", "additional_adapter")) + model_resume.train_fusion(Fuse("adapter", "additional_adapter")) trainer_resume = AdapterTrainer( model=model_resume, args=TrainingArguments(do_train=True, max_steps=1, output_dir=tmpdirname), diff --git a/tests/test_mistral.py b/tests/test_mistral.py index b10065a70..3e5d970d4 100644 --- a/tests/test_mistral.py +++ b/tests/test_mistral.py @@ -3,6 +3,8 @@ from transformers.models.mistral.configuration_mistral import MistralConfig from transformers.testing_utils import require_torch +from transformers.testing_utils import require_torch + from .composition.test_parallel import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin from .methods import ( BottleneckAdapterTestMixin, @@ -27,13 +29,12 @@ class MistralAdapterTestBase(AdapterTestBase): MistralConfig, hidden_size=32, num_hidden_layers=5, - num_attention_heads=8, + num_attention_heads=4, intermediate_size=37, hidden_act="gelu", - hidden_dropout_prob=0.1, pad_token_id=0, ) - tokenizer_name = "HuggingFaceH4/zephyr-7b-beta" + tokenizer_name = "mistralai/Mistral-7B-v0.1" @require_torch @@ -63,4 +64,5 @@ class MistralClassConversionTest( MistralAdapterTestBase, unittest.TestCase, ): - pass + def 
+        raise self.skipTest("We don't support the Mistral QA model.")
diff --git a/tests/test_plbart.py b/tests/test_plbart.py
deleted file mode 100644
index aa8445791..000000000
--- a/tests/test_plbart.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import unittest
-
-from tests.methods.test_config_union import ConfigUnionAdapterTest
-from transformers import PLBartConfig
-from transformers.testing_utils import require_torch
-
-from .composition.test_parallel import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin
-from .methods import (
-    BottleneckAdapterTestMixin,
-    CompacterTestMixin,
-    IA3TestMixin,
-    LoRATestMixin,
-    PrefixTuningTestMixin,
-    UniPELTTestMixin,
-)
-from .test_adapter import AdapterTestBase, make_config
-from .test_adapter_backward_compability import CompabilityTestMixin
-from .test_adapter_conversion import ModelClassConversionTestMixin
-from .test_adapter_embeddings import EmbeddingTestMixin
-from .test_adapter_fusion_common import AdapterFusionModelTestMixin
-from .test_adapter_heads import PredictionHeadModelTestMixin
-
-
-class PLBartAdapterTestBase(AdapterTestBase):
-    config_class = PLBartConfig
-    config = make_config(
-        PLBartConfig,
-        d_model=16,
-        encoder_layers=2,
-        decoder_layers=2,
-        encoder_attention_heads=4,
-        decoder_attention_heads=4,
-        encoder_ffn_dim=4,
-        decoder_ffn_dim=4,
-        scale_embedding=False,  # Required for embedding tests
-    )
-    tokenizer_name = "uclanlp/plbart-base"
-
-
-@require_torch
-class PLBartAdapterTest(
-    BottleneckAdapterTestMixin,
-    CompacterTestMixin,
-    IA3TestMixin,
-    LoRATestMixin,
-    PrefixTuningTestMixin,
-    UniPELTTestMixin,
-    AdapterFusionModelTestMixin,
-    CompabilityTestMixin,
-    EmbeddingTestMixin,
-    PredictionHeadModelTestMixin,
-    ParallelAdapterInferenceTestMixin,
-    ParallelTrainingMixin,
-    ConfigUnionAdapterTest,
-    PLBartAdapterTestBase,
-    unittest.TestCase,
-):
-    pass
-
-
-@require_torch
-class PLBartClassConversionTest(
-    ModelClassConversionTestMixin,
-    PLBartAdapterTestBase,
-    unittest.TestCase,
-):
-    pass
diff --git a/utils/back_comp/README.md b/utils/back_comp/README.md
deleted file mode 100644
index 14896c613..000000000
--- a/utils/back_comp/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Backwards Compatibility Tests
-
-## Motivation
-
-This directory contains a set of tests that can be run to ensure that newly introduced changes or refactorings do not break existing functionalities. These tests verify model output consistency between two branches; here, we use the names `dev` and `main` for demonstration purposes, but these tests can be performed between any two branches where the `back_comp` directory with tests is available.
-For this, the test script performs a forward pass for each supported model and compares the outputs between `dev` and `main` to identify any differences.
-
-## Requirements
-
-To execute these tests, you must meet the following requirements:
-
-- Ability to run bash scripts (in-built on Linux/macOS; for Windows, consider using third-party software like [GNU Bash](https://www.gnu.org/software/bash/)).
-- Git as the version control system to switch between branches.
-- The ability to check out the desired branch. If the branch is from another fork, you may need to add the repository as a remote. Refer to [GitHub's instructions](https://docs.github.com/en/get-started/getting-started-with-git/managing-remote-repositories) for details.
-- A Python virtual environment to modify the installed package version of `adapters`.
-
-## Procedure
-
-To perform the compatibility tests, follow these steps:
-
-1. Determine a directory where you want to save the model output generated by the tests. Save this directory path to the variable `SaveDir` in the shell script `compare.sh`. (Careful: select a directory OUTSIDE of the repository; otherwise, the saved model output is no longer available when changing the branch.)
-2. Select the branch you want to compare with `main` and save its name to the variable `Branch`.
-3. Make sure you are checked out in `main` before starting the test script.
-4. In your command line, navigate to the `back_comp` directory and execute the script by running `sh compare.sh`.
-
-The results will be displayed in the command line for visualization.
\ No newline at end of file
diff --git a/utils/back_comp/Utils.py b/utils/back_comp/Utils 2.py
similarity index 97%
rename from utils/back_comp/Utils.py
rename to utils/back_comp/Utils 2.py
index 21c15545f..8ed482130 100644
--- a/utils/back_comp/Utils.py
+++ b/utils/back_comp/Utils 2.py
@@ -29,7 +29,6 @@
    GPT2Config,
    GPTJConfig,
    MBartConfig,
-    PLBartConfig,
    RobertaConfig,
    T5Config,
    ViTConfig,
@@ -131,7 +130,6 @@ def get_model_names():
        "gpt2",
        "gptj",
        "mbart",
-        "plbart",
        "roberta",
        "t5",
        "vit",
@@ -285,19 +283,6 @@ def create_model(model_name: str, model_class: Any) -> Any:
        )
        model = model_class.from_config(mbart_config)
 
-    elif model_name == "plbart":
-        plbart_config = PLBartConfig(
-            d_model=16,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=4,
-            decoder_attention_heads=4,
-            encoder_ffn_dim=4,
-            decoder_ffn_dim=4,
-            vocab_size=50005,
-        )
-        model = model_class.from_config(plbart_config)
-
    elif model_name == "roberta":
        roberta_config = RobertaConfig(
            hidden_size=32,
diff --git a/utils/back_comp/compare_outputs.py b/utils/back_comp/compare_outputs.py
deleted file mode 100644
index 0775bf1bd..000000000
--- a/utils/back_comp/compare_outputs.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import argparse
-import os
-
-from Utils import (
-    compare_lists_close,
-    convert_tensors_to_list,
-    create_output,
-    fix_seeds,
-    get_model_names,
-    get_new_adapter_config_strings,
-    load_model,
-    restore_from_jsonl,
-)
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str)
-args = parser.parse_args()
-
-
-# Create the root path
-base_dir = os.path.join(args.path, "model_outputs")
-fix_seeds()
-
-for model_name in get_model_names():
-    # Load the reference model
-    print(f"Model = {model_name}")
-    model_dir = os.path.join(base_dir, model_name)
-    model = load_model(model_name, os.path.join(model_dir, "model_weights"))
-
-    for adapter_config in get_new_adapter_config_strings():
-        # Create a new model output
-        adapter_name = model.load_adapter(os.path.join(model_dir, "weights_" + adapter_config))
-        model.set_active_adapters(adapter_name)
-        model_output = create_output(model, model_name)
-
-        # Compare the model output to the reference output
-        model_output_n, last_hidden_state = convert_tensors_to_list(model_output)
-        ref_output = restore_from_jsonl(config=adapter_config, file_path=os.path.join(model_dir, "output.jsonl"))
-        is_equal = compare_lists_close(ref_output, model_output_n, rtol=1e-05, atol=1e-08)
-        print(f"Adapter: {adapter_config} -> {is_equal}")
-
-    model.delete_adapter(adapter_name)
diff --git a/utils/convert_xmod_checkpoint.py b/utils/convert_xmod_checkpoint.py
index b3744fece..30ca0ede7 100644
--- a/utils/convert_xmod_checkpoint.py
+++ b/utils/convert_xmod_checkpoint.py
@@ -1,7 +1,6 @@
 """
 This script can be used to convert an Xmod checkpoints (including adapters) from the HF format to the Adapters format.
 """
-
 import argparse
 import os
 import re