diff --git a/src/adapters/__init__.py b/src/adapters/__init__ 2.py similarity index 98% rename from src/adapters/__init__.py rename to src/adapters/__init__ 2.py index 20d8eaf77..a10439da6 100644 --- a/src/adapters/__init__.py +++ b/src/adapters/__init__ 2.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.0.0.dev0" +__version__ = "0.2.2" from typing import TYPE_CHECKING @@ -111,7 +111,6 @@ "models.mbart": ["MBartAdapterModel"], "models.mistral": ["MistralAdapterModel"], "models.mt5": ["MT5AdapterModel"], - "models.plbart": ["PLBartAdapterModel"], "models.roberta": ["RobertaAdapterModel"], "models.t5": ["T5AdapterModel"], "models.vit": ["ViTAdapterModel"], @@ -217,10 +216,9 @@ from .models.gpt2 import GPT2AdapterModel from .models.gptj import GPTJAdapterModel from .models.llama import LlamaAdapterModel - from .models.mbart import MBartAdapterModel from .models.mistral import MistralAdapterModel + from .models.mbart import MBartAdapterModel from .models.mt5 import MT5AdapterModel - from .models.plbart import PLBartAdapterModel from .models.roberta import RobertaAdapterModel from .models.t5 import T5AdapterModel from .models.vit import ViTAdapterModel diff --git a/src/adapters/composition.py b/src/adapters/composition 2.py similarity index 96% rename from src/adapters/composition.py rename to src/adapters/composition 2.py index 62c2854ac..16761f400 100644 --- a/src/adapters/composition.py +++ b/src/adapters/composition 2.py @@ -1,5 +1,4 @@ import itertools -import warnings from collections.abc import Sequence from typing import List, Optional, Set, Tuple, Union @@ -92,7 +91,7 @@ def __init__( self, *average_adapters: List[Union[AdapterCompositionBlock, str]], weights: Optional[List[float]] = None, - normalize_weights: bool = True, + normalize_weights: bool = True ): super().__init__(*average_adapters) if weights is not None: @@ -129,7 +128,6 @@ def __init__( "bart", "mbart", "mt5", - "plbart", "gpt2", "gptj", "t5", @@ -155,7 +153,7 @@ def validate_composition(adapter_composition: AdapterCompositionBlock, level=0, f"Models of type {model_type} don't support adapter composition using {block_type.__name__}." ) for child in adapter_composition: - if not type(child) in ALLOWED_NESTINGS[type(adapter_composition)]: + if type(child) not in ALLOWED_NESTINGS[type(adapter_composition)]: raise ValueError(f"Adapter setup is invalid. Cannot nest {child} in {adapter_composition}") # recursively validate children validate_composition(child, level=level + 1) @@ -181,11 +179,6 @@ def parse_composition(adapter_composition, level=0, model_type=None) -> AdapterC else: return adapter_composition elif isinstance(adapter_composition, Sequence): - # Functionality of adapter-transformers v1.x - warnings.warn( - "Passing list objects for adapter activation is deprecated. Please use Stack or Fuse explicitly.", - category=FutureWarning, - ) # for backwards compatibility if level == 1: block_class = Fuse diff --git a/src/adapters/configuration/adapter_fusion_config.py b/src/adapters/configuration/adapter_fusion_config.py index 6dc31dab1..552bcdbe6 100644 --- a/src/adapters/configuration/adapter_fusion_config.py +++ b/src/adapters/configuration/adapter_fusion_config.py @@ -36,7 +36,7 @@ def load(cls, config: Union[dict, str], **kwargs): dict: The resolved adapter fusion configuration dictionary. """ # currently storing AdapterFusion weights on AdapterHub is not supported. 
- config_dict = resolve_adapter_config(config, local_map=ADAPTERFUSION_CONFIG_MAP) + config_dict = resolve_adapter_config(config, local_map=ADAPTERFUSION_CONFIG_MAP, try_loading_from_hub=False) # convert back to dict to allow attr overrides if isinstance(config_dict, AdapterFusionConfig): config_dict = config_dict.to_dict() diff --git a/src/adapters/configuration/model_adapters_config.py b/src/adapters/configuration/model_adapters_config.py index 3ae7dcf56..3f4c3023d 100644 --- a/src/adapters/configuration/model_adapters_config.py +++ b/src/adapters/configuration/model_adapters_config.py @@ -237,6 +237,5 @@ def build_full_config(adapter_config, model_config, save_id2label=False, **kwarg config_dict["config"] = adapter_config.to_dict() else: config_dict["config"] = adapter_config - # add lib name before version to distinguish from adapter-transformers - config_dict["version"] = "adapters." + __version__ + config_dict["version"] = __version__ return config_dict diff --git a/src/adapters/context.py b/src/adapters/context.py deleted file mode 100644 index 70e685d03..000000000 --- a/src/adapters/context.py +++ /dev/null @@ -1,151 +0,0 @@ -import functools -import threading - -from .composition import parse_composition, parse_heads_from_composition - - -class AdapterSetup: - """ - Represents an adapter setup of a model including active adapters and active heads. This class is intended to be - used as a context manager using the ``with`` statement. The setup defined by the ``AdapterSetup`` context will - override static adapter setups defined in a model (i.e. setups specified via ``active_adapters``). - - Example:: - - with AdapterSetup(Stack("a", "b")): - # will use the adapter stack "a" and "b" outputs = model(**inputs) - - Note that the context manager is thread-local, i.e. it can be used with different setups in a multi-threaded - environment. - """ - - # thread-local storage that holds a stack of active contexts - storage = threading.local() - - def __init__(self, adapter_setup, head_setup=None, ignore_empty: bool = False): - self.adapter_setup = parse_composition(adapter_setup) - if head_setup: - self.head_setup = head_setup - else: - self.head_setup = parse_heads_from_composition(self.adapter_setup) - self._empty = ignore_empty and self.adapter_setup is None and self.head_setup is None - - def __enter__(self): - if not self._empty: - AdapterSetup.get_contexts().append(self) - return self - - def __exit__(self, type, value, traceback): - if not self._empty: - AdapterSetup.get_contexts().pop() - - @classmethod - def get_contexts(cls): - if not hasattr(cls.storage, "contexts"): - cls.storage.contexts = [] - return cls.storage.contexts - - @classmethod - def get_context(cls): - try: - return cls.get_contexts()[-1] - except IndexError: - return None - - @classmethod - def get_context_adapter_setup(cls): - context = cls.get_context() - if context: - return context.adapter_setup - return None - - @classmethod - def get_context_head_setup(cls): - context = cls.get_context() - if context: - return context.head_setup - return None - - -class ForwardContext: - """ - Holds context information during a forward pass through a model. This class should be used via the - ``ForwardContext.wrap()`` method. - - Note that the context is thread-local. 
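# Illustrative sketch (assumed usage, not taken from the patch): how the AdapterSetup context
# manager deleted in the context.py hunk above is used, following its own docstring example.
# The checkpoint name and the adapter names "a"/"b" are placeholders.
from adapters import AdapterSetup, AutoAdapterModel
from adapters.composition import Stack
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoAdapterModel.from_pretrained("bert-base-uncased")
model.add_adapter("a")
model.add_adapter("b")

inputs = tokenizer("Adapters are lightweight.", return_tensors="pt")
with AdapterSetup(Stack("a", "b")):
    # the stack "a" -> "b" is active only inside this thread-local context,
    # overriding any adapters activated statically via model.active_adapters
    outputs = model(**inputs)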
- """ - - # thread-local storage that holds a stack of active contexts - storage = threading.local() - - context_attributes = [ - "adapter_gating_scores", - "adapter_fusion_attentions", - "adapter_input_parallelized", - ] - # Additional used attributes not exposed to the user - # - prompt_tokens_length: length of the prompt tokens - - def __init__(self, model, *args, **kwargs): - # If the model has a method ``forward_context()``, use it to create the context. - if hasattr(model, "forward_context"): - model.forward_context(self, *args, **kwargs) - - def __enter__(self): - ForwardContext.get_contexts().append(self) - return self - - def __exit__(self, type, value, traceback): - ForwardContext.get_contexts().pop() - - @classmethod - def wrap(cls, f): - """ - Decorator method that wraps a ``forward()`` function of a model class. - """ - - @functools.wraps(f) - def wrapper_func(self, *args, **kwargs): - if self.adapters_config is not None: - with cls(self, *args, **kwargs) as ctx: - # whether to output the context attributes - output_context = kwargs.pop("output_context", False) - kwargs = { - k: v for k, v in kwargs.items() if k.replace("output_", "") not in cls.context_attributes - } - results = f(self, *args, **kwargs) - - # append output attributes - if isinstance(results, tuple): - for attr in cls.context_attributes: - if getattr(ctx, "output_" + attr, False): - results = results + (dict(getattr(ctx, attr)),) - else: - for attr in cls.context_attributes: - if getattr(ctx, "output_" + attr, False): - results[attr] = dict(getattr(ctx, attr)) - - if output_context: - context_dict = ctx.__dict__ - - if output_context: - return results, context_dict - else: - return results - else: - return f(self, *args, **kwargs) - - return wrapper_func - - @classmethod - def get_contexts(cls): - if not hasattr(cls.storage, "contexts"): - cls.storage.contexts = [] - return cls.storage.contexts - - @classmethod - def get_context(cls): - try: - return cls.get_contexts()[-1] - except IndexError: - return None diff --git a/src/adapters/head_utils.py b/src/adapters/head_utils 2.py similarity index 95% rename from src/adapters/head_utils.py rename to src/adapters/head_utils 2.py index 8226d1ed6..60b2ce52b 100644 --- a/src/adapters/head_utils.py +++ b/src/adapters/head_utils 2.py @@ -369,27 +369,6 @@ }, "layers": ["lm_head"], }, - # PLBART - "PLBartForSequenceClassification": { - "config": { - "head_type": "classification", - "layers": 2, - "activation_function": "tanh", - }, - "layers": [ - None, - "classification_head.dense", - None, - None, - "classification_head.out_proj", - ], - }, - "PLBartForConditionalGeneration": { - "config": { - "head_type": "seq2seq_lm", - }, - "layers": ["lm_head"], - }, # MT5 "MT5ForConditionalGeneration": { "config": { @@ -673,15 +652,7 @@ }, "layers": [None, "qa_outputs"], }, - "LlamaForTokenClassification": { - "config": { - "head_type": "tagging", - "layers": 1, - "activation_function": None, - }, - "layers": [None, "score"], - }, - # Mistral + #Mistral "MistralForSequenceClassification": { "config": { "head_type": "classification", @@ -698,14 +669,6 @@ }, "layers": ["lm_head"], }, - "MistralForTokenClassification": { - "config": { - "head_type": "tagging", - "layers": 1, - "activation_function": None, - }, - "layers": [None, "score"], - }, # Electra "ElectraForTokenClassification": { "config": { diff --git a/src/adapters/heads/model_mixin.py b/src/adapters/heads 2/model_mixin 2.py similarity index 99% rename from src/adapters/heads/model_mixin.py rename to src/adapters/heads 
2/model_mixin 2.py index 9a27bbd76..bc197ddbf 100644 --- a/src/adapters/heads/model_mixin.py +++ b/src/adapters/heads 2/model_mixin 2.py @@ -134,8 +134,6 @@ def tie_weights(self): self = getattr(self, self.base_model_prefix) self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix) - super().tie_weights() - def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None): old_embeddings = self.get_input_embeddings() new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of) @@ -527,7 +525,7 @@ def forward_head( attention_mask=None, return_dict=False, context=None, - **kwargs, + **kwargs ): """ The forward pass through a prediction head configuration. There are three ways to specify the used prediction diff --git a/src/adapters/heads/base.py b/src/adapters/heads/base.py deleted file mode 100644 index 82dd8097a..000000000 --- a/src/adapters/heads/base.py +++ /dev/null @@ -1,521 +0,0 @@ -import logging -from dataclasses import dataclass -from typing import List, Optional - -import torch -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.modeling_outputs import ( - ImageClassifierOutput, - MultipleChoiceModelOutput, - QuestionAnsweringModelOutput, - Seq2SeqModelOutput, - Seq2SeqQuestionAnsweringModelOutput, - Seq2SeqSequenceClassifierOutput, - SequenceClassifierOutput, - TokenClassifierOutput, -) -from transformers.utils import ModelOutput - -from ..composition import adjust_tensors_for_parallel -from ..methods.modeling import Activation_Function_Class - - -logger = logging.getLogger(__name__) - - -@dataclass -class MultiHeadOutput(ModelOutput): - head_outputs: List[ModelOutput] = None - loss: Optional[torch.FloatTensor] = None - - @property - def logits(self): - return torch.vstack([outputs["logits"] for outputs in self.head_outputs]) - - def __getitem__(self, k): - # with number indices the head output at that position is accessed - # e.g output[1] is equivalent to output.head_outputs[1] - if isinstance(k, int): - return self.head_outputs[k] - # with strings the attribute in the underlying dict can be adressed - # e.g output["loss"] is equivalent to output.loss - else: - return super().__getitem__(k) - - def __setitem__(self, k, v): - if isinstance(k, int): - self.head_outputs[k] = v - else: - return super().__setitem__(k, v) - - def __iter__(self): - # iterates over the head outputs - return iter(self.head_outputs) - - def __len__(self): - return len(self.head_outputs) - - -# Let this class inherit from nn.Sequential to provide iterable access as before -class PredictionHead(nn.Sequential): - def __init__(self, name): - super().__init__() - self.config = {} - self.name = name - - def _get_dropout_prob(self, model_config): - # try to infer dropout prob from various sources, default to 0.0 - if "dropout_prob" in self.config and self.config["dropout_prob"] is not None: - dropout_prob = self.config["dropout_prob"] - elif hasattr(model_config, "classifier_dropout") and model_config.classifier_dropout is not None: - dropout_prob = model_config.classifier_dropout - elif hasattr(model_config, "hidden_dropout_prob") and model_config.hidden_dropout_prob is not None: - dropout_prob = model_config.hidden_dropout_prob - else: - dropout_prob = 0.0 - - return dropout_prob - - def build(self, model): - model_config = model.config - pred_head = [] - dropout_prob = self._get_dropout_prob(model_config) - bias = self.config.get("bias", True) - for l_id in 
range(self.config["layers"]): - pred_head.append(nn.Dropout(dropout_prob)) - if l_id < self.config["layers"] - 1: - pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size)) - if self.config["activation_function"]: - pred_head.append(Activation_Function_Class(self.config["activation_function"])) - else: - if "num_labels" in self.config: - pred_head.append(nn.Linear(model_config.hidden_size, self.config["num_labels"], bias=bias)) - elif "num_choices" in self.config: # used for multiple_choice head - pred_head.append(nn.Linear(model_config.hidden_size, 1, bias=bias)) - else: - pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size, bias=bias)) - if self.config["activation_function"]: - pred_head.append(Activation_Function_Class(self.config["activation_function"])) - for i, module in enumerate(pred_head): - self.add_module(str(i), module) - - # We need to import the current value of _init_weights at each execution to determine if weights init is disabled. - from transformers.modeling_utils import _init_weights - - if _init_weights: - self.apply(model._init_weights) - self.train(model.training) # make sure training mode is consistent - - def get_output_embeddings(self): - return None # override for heads with output embeddings - - def get_label_names(self): - return ["labels"] - - def _get_cls_output(self, outputs, **kwargs): - if self.config["use_pooler"]: - cls_output = kwargs.pop("pooled_output") - elif kwargs.get("get_cls_from_eos_tokens", False): - x = outputs[0] # last hidden state - eos_mask = kwargs.get("eos_mask") - (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) - if len(torch.unique(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - cls_output = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] - else: - cls_output = outputs[0][:, 0] - - return cls_output - - -class ClassificationHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=2, - activation_function="tanh", - id2label=None, - use_pooler=False, - bias=True, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "classification", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "use_pooler": use_pooler, - "bias": bias, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - if cls_output is None: - cls_output = self._get_cls_output(outputs, **kwargs) - logits = super().forward(cls_output) - loss = None - labels = kwargs.pop("labels", None) - if labels is not None: - if self.config["num_labels"] == 1: - # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config["num_labels"]), labels.view(-1)) - - if return_dict: - if isinstance(outputs, Seq2SeqModelOutput): - return Seq2SeqSequenceClassifierOutput( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - 
else: - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs - - -class MultiLabelClassificationHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=2, - activation_function="tanh", - id2label=None, - use_pooler=False, - bias=True, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "multilabel_classification", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "use_pooler": use_pooler, - "bias": bias, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - if cls_output is None: - cls_output = self._get_cls_output(outputs, **kwargs) - logits = super().forward(cls_output) - loss = None - labels = kwargs.pop("labels", None) - if labels is not None: - loss_fct = BCEWithLogitsLoss() - if labels.dtype != torch.float32: - labels = labels.float() - loss = loss_fct(logits, labels) - - if return_dict: - if isinstance(outputs, Seq2SeqModelOutput): - return Seq2SeqSequenceClassifierOutput( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - else: - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs - - -class MultipleChoiceHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_choices=2, - layers=2, - activation_function="tanh", - id2label=None, - use_pooler=False, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "multiple_choice", - "num_choices": num_choices, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "use_pooler": use_pooler, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=None, **kwargs): - if cls_output is None: - cls_output = self._get_cls_output(outputs, **kwargs) - logits = super().forward(cls_output) - logits = logits.view(-1, self.config["num_choices"]) - loss = None - labels = kwargs.pop("labels", None) - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits, labels) - - if return_dict: - return MultipleChoiceModelOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs - - -class TaggingHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=1, - activation_function="tanh", - id2label=None, - dropout_prob=None, - ): - 
super().__init__(head_name) - self.config = { - "head_type": "tagging", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - logits = super().forward(outputs[0]) - loss = None - - labels = kwargs.pop("labels", None) - if labels is not None: - loss_fct = CrossEntropyLoss() - # adjust labels for prompt tuning - if kwargs.get("prompt_tokens_length", 0) > 0: - prompt_length = kwargs.get("prompt_tokens_length") - prompt_labels = torch.full( - (labels.shape[0], prompt_length), loss_fct.ignore_index, dtype=torch.long, device=labels.device - ) - labels = torch.cat((prompt_labels, labels), dim=-1) - if attention_mask is not None: - attention_mask = torch.cat( - (torch.ones_like(prompt_labels, dtype=torch.long, device=labels.device), attention_mask), - dim=-1, - ) - - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.config["num_labels"]) - active_labels = torch.where( - active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) - ) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.config["num_labels"]), labels.view(-1)) - - if return_dict: - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs - - -class QuestionAnsweringHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=1, - activation_function="tanh", - id2label=None, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "question_answering", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - sequence_output = outputs[0] - logits = super().forward(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - start_positions = kwargs.pop("start_positions", None) - end_positions = kwargs.pop("end_positions", None) - total_loss = None - if start_positions is not None and end_positions is not None: - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if return_dict: - if isinstance(outputs, Seq2SeqModelOutput): - return Seq2SeqQuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - 
end_logits=end_logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - else: - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = ( - start_logits, - end_logits, - ) + outputs[1:] - if total_loss is not None: - outputs = (total_loss,) + outputs - return outputs - - def get_label_names(self): - return ["start_positions", "end_positions"] - - -class ImageClassificationHead(PredictionHead): - def __init__( - self, - model, - head_name, - num_labels=2, - layers=2, - activation_function="tanh", - multilabel=False, - id2label=None, - use_pooler=False, - bias=True, - dropout_prob=None, - ): - super().__init__(head_name) - self.config = { - "head_type": "image_classification", - "num_labels": num_labels, - "layers": layers, - "activation_function": activation_function, - "multilabel": multilabel, - "label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None, - "use_pooler": use_pooler, - "bias": bias, - "dropout_prob": dropout_prob, - } - self.build(model) - - def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs): - if cls_output is None: - cls_output = self._get_cls_output(outputs, **kwargs) - logits = super().forward(cls_output) - loss = None - labels = kwargs.pop("labels", None) - if labels is not None: - if self.config["num_labels"] == 1: - # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - elif self.config["multilabel"]: - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config["num_labels"]), labels.view(-1)) - - if return_dict: - return ImageClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - outputs = (logits,) + outputs[1:] - if labels is not None: - outputs = (loss,) + outputs - return outputs diff --git a/src/adapters/heads/dependency_parsing.py b/src/adapters/heads/dependency_parsing.py index d2cbf98c0..d568f356b 100644 --- a/src/adapters/heads/dependency_parsing.py +++ b/src/adapters/heads/dependency_parsing.py @@ -2,7 +2,6 @@ Code taken and modified from: https://github.com/Adapter-Hub/hgiyt. Credits: "How Good is Your Tokenizer? 
On the Monolingual Performance of Multilingual Language Models" (Rust et al., 2021) https://arxiv.org/abs/2012.15613 """ - from dataclasses import dataclass from typing import Optional, Tuple @@ -97,7 +96,7 @@ def forward( word_starts=None, labels_arcs=None, labels_rels=None, - **kwargs, + **kwargs ): outs = self.dropout(outputs[0]) word_outputs_deps = self._merge_subword_tokens(outs, word_starts) diff --git a/src/adapters/hub_mixin.py b/src/adapters/hub_mixin 2.py similarity index 74% rename from src/adapters/hub_mixin.py rename to src/adapters/hub_mixin 2.py index c23c92eb7..7a1009c5b 100644 --- a/src/adapters/hub_mixin.py +++ b/src/adapters/hub_mixin 2.py @@ -1,5 +1,6 @@ import logging import os +import warnings from typing import List, Optional, Union from transformers.utils.generic import working_or_temp_dir @@ -35,7 +36,7 @@ from adapters import AutoAdapterModel model = AutoAdapterModel.from_pretrained("{model_name}") -adapter_name = model.load_adapter("{adapter_repo_name}", set_active=True) +adapter_name = model.load_adapter("{adapter_repo_name}", source="hf", set_active=True) ``` ## Architecture & Training @@ -61,21 +62,28 @@ def _save_adapter_card( save_directory: str, adapter_name: str, adapter_repo_name: str, + adapterhub_tag: Optional[str] = None, datasets_tag: Optional[str] = None, tags: Optional[List[str]] = None, language: Optional[str] = None, license: Optional[str] = None, metrics: Optional[List[str]] = None, - **kwargs, + **kwargs ): # Key remains "adapter-transformers", see: https://github.com/huggingface/huggingface.js/pull/459 all_tags = {"adapter-transformers"} datasets = set() # Dataset/ Task info dataset_name = None + if adapterhub_tag is None and datasets_tag is None: + raise ValueError("Either adapterhub_tag or datasets_tag must be specified.") if datasets_tag is not None: dataset_name = f"[{datasets_tag}](https://huggingface.co/datasets/{datasets_tag}/)" datasets.add(datasets_tag) + if adapterhub_tag is not None: + # adapterhub_tag overwrites datasets_tag + dataset_name = f"[{adapterhub_tag}](https://adapterhub.ml/explore/{adapterhub_tag}/)" + all_tags.add(f"adapterhub:{adapterhub_tag}") all_tags.add(self.config.model_type) if tags is not None: @@ -115,8 +123,10 @@ def _save_adapter_card( def push_adapter_to_hub( self, - repo_id: str, + repo_name: str, adapter_name: str, + organization: Optional[str] = None, + adapterhub_tag: Optional[str] = None, datasets_tag: Optional[str] = None, local_path: Optional[str] = None, commit_message: Optional[str] = None, @@ -127,15 +137,21 @@ def push_adapter_to_hub( revision: str = None, commit_description: str = None, adapter_card_kwargs: Optional[dict] = None, + **deprecated_kwargs, ): """Upload an adapter to HuggingFace's Model Hub. Args: - repo_id (str): The name of the repository on the model hub to upload to. + repo_name (str): The name of the repository on the model hub to upload to. adapter_name (str): The name of the adapter to be uploaded. organization (str, optional): Organization in which to push the adapter (you must be a member of this organization). Defaults to None. - datasets_tag (str, optional): Dataset identifier from https://huggingface.co/datasets. Defaults to + adapterhub_tag (str, optional): + Tag of the format `/` for categorization on https://adapterhub.ml/explore/. See + https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. If not specified, + `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. 
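# Illustrative sketch (assumed usage, not taken from the patch): calling push_adapter_to_hub()
# with the signature shown on the "+" side of the hub_mixin hunk above. Repo name, adapter name
# and tags are placeholders; `model` is assumed to hold a trained adapter "a" (e.g. as in the
# earlier sketch), and a valid Hugging Face token is assumed to be configured.
model.push_adapter_to_hub(
    "my-awesome-adapter",             # repo_name
    "a",                              # adapter_name of an adapter added to the model
    adapterhub_tag="sentiment/imdb",  # at least one tag is needed when a new adapter card is generated
    datasets_tag="imdb",
)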
+ datasets_tag (str, optional): Dataset identifier from https://huggingface.co/datasets. + If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to None. local_path (str, optional): Local path used as clone directory of the adapter repository. If not specified, will create a temporary directory. Defaults to None. @@ -160,6 +176,31 @@ def push_adapter_to_hub( Returns: str: The url of the adapter repository on the model hub. """ + use_auth_token = deprecated_kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in future versions of Adapters." + " Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if organization is not None and not repo_name.startswith(organization): + warnings.warn( + "The `organization` argument is deprecated and will be removed in future versions of" + " Adapters. Set your organization directly in the `repo_id` passed instead" + " (`repo_id={organization}/{model_id}`)." + ) + if "/" in repo_name: + repo_name = repo_name.split("/")[-1] + repo_id = f"{organization}/{repo_name}" + else: + repo_id = repo_name + use_temp_dir = not os.path.isdir(local_path) if local_path else True # Create repo or get retrieve an existing repo @@ -177,6 +218,7 @@ def push_adapter_to_hub( work_dir, adapter_name, repo_id, + adapterhub_tag=adapterhub_tag, datasets_tag=datasets_tag, **adapter_card_kwargs, ) diff --git a/src/adapters/loading.py b/src/adapters/loading 2.py similarity index 98% rename from src/adapters/loading.py rename to src/adapters/loading 2.py index b1918b0a0..8d730680f 100644 --- a/src/adapters/loading.py +++ b/src/adapters/loading 2.py @@ -507,7 +507,7 @@ def load( loading_info=None, leave_out=None, set_active=False, - **kwargs, + **kwargs ): """ Loads a pre-trained pytorch adapter module from the local file system or a remote location. @@ -518,9 +518,9 @@ def load( - the identifier of a pre-trained task adapter to be loaded from Adapter Hub - a path to a directory containing adapter weights saved using `model.saved_adapter()` - a URL pointing to a zip folder containing a saved adapter module - config (str, optional): Deprecated. + config (str, optional): The requested configuration of the adapter. version (str, optional): The version of the adapter to be loaded. - model_name (str, optional): Deprecated. + model_name (str, optional): The string identifier of the pre-trained model. load_as (str, optional): Load the adapter using this name. By default, the name with which the adapter was saved will be used. @@ -528,13 +528,6 @@ def load( Tuple[str, str]: A tuple consisting of the local file system directory from which the weights where loaded and the name of the loaded weights. """ - # Warn about deprecated arguments - if config is not None or model_name is not None: - logger.warning( - "The 'config' and 'model_name' arguments are specific to the now unsupported legacy Hub repo and will" - " be removed." 
- "Please switch to only providing the HF Model Hub identifier.", - ) requested_config = AdapterConfig.load(config) if config else None # Resolve the weights to be loaded based on the given identifier and the current adapter config model_name = self.model.model_name or model_name diff --git a/src/adapters/methods/adapter_layer_base.py b/src/adapters/methods/adapter_layer_base.py index b03dd5e9c..04aa24927 100644 --- a/src/adapters/methods/adapter_layer_base.py +++ b/src/adapters/methods/adapter_layer_base.py @@ -90,25 +90,21 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: """ raise NotImplementedError() - def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy, **kwargs) -> bool: + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: """Averages a set of adapter modules into a new adapter module. Args: adapter_name (str): The name of the new (averaged) adapter module to add. - input_adapters (Dict[str, float]): Dictionary of adapter names and their corresponding weights. - combine_strategy (str): The strategy to combine the adapters. Available strategies depend on the used adapter method, see: https://docs.adapterhub.ml/adapter_composition.html#merging-adapters - **kwargs: Additional arguments that are specific to the combine_strategy. E.g. svd_rank for LoRA. + input_adapters (Dict[str, float]): Either: + - a list of adapter names (with equal weighting). + - a dictionary of adapter names and their corresponding weights. Returns: bool: True if the adapter was added, False otherwise. """ # add new adapter if self.add_adapter(adapter_name, self.layer_idx): - if combine_strategy != "linear": - # You get the adapter type from the input adapters - raise ValueError(f"Combine strategy {combine_strategy} not supported for the chosen adapter methods.") - - # average weights linearly + # average weights avg_state_dict = {} for name, weight in input_adapters.items(): if name in self.adapter_modules: @@ -121,10 +117,8 @@ def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float], c else: self.delete_adapter(adapter_name) # clean up before raising error raise ValueError("Adapter {} not found.".format(name)) - # load averaged weights self.adapter_modules[adapter_name].load_state_dict(avg_state_dict) - return True return False diff --git a/src/adapters/methods/bottleneck.py b/src/adapters/methods/bottleneck.py index fa66a095e..b3125c696 100644 --- a/src/adapters/methods/bottleneck.py +++ b/src/adapters/methods/bottleneck.py @@ -1,4 +1,4 @@ -from typing import List, Mapping, NamedTuple, Optional, Union +from typing import Dict, List, Mapping, NamedTuple, Optional, Union import torch from torch import nn @@ -94,6 +94,28 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: return False + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: + # add new adapter + if self.add_adapter(adapter_name, self.layer_idx): + # average weights + avg_state_dict = {} + for name, weight in input_adapters.items(): + if name in self.adapters: + module = self.adapters[name] + for k, v in module.state_dict().items(): + if k in avg_state_dict: + avg_state_dict[k] += weight * v + else: + avg_state_dict[k] = weight * v + else: + self.delete_adapter(adapter_name) # clean up before raising error + raise ValueError("Adapter {} not found.".format(name)) + # load averaged weights + self.adapters[adapter_name].load_state_dict(avg_state_dict) + return True + + return False + 
def add_fusion_layer(self, adapter_names: Union[List, str]): """See BertModel.add_fusion_layer""" adapter_names = adapter_names if isinstance(adapter_names, list) else adapter_names.split(",") @@ -173,11 +195,9 @@ def pad_and_concat(self, states: List[BottleneckState]) -> BottleneckState: torch.cat([state.input_tensor for state in states], dim=0), torch.cat([state.adapter_residual for state in states], dim=0), states[0].layer_norm, - ( - torch.cat([state.bottleneck_up for state in states], dim=0) - if states[0].bottleneck_up is not None - else None - ), + torch.cat([state.bottleneck_up for state in states], dim=0) + if states[0].bottleneck_up is not None + else None, states[-1].last, ) diff --git a/src/adapters/methods/lora.py b/src/adapters/methods/lora.py index c62a94f26..e54042b55 100644 --- a/src/adapters/methods/lora.py +++ b/src/adapters/methods/lora.py @@ -224,109 +224,28 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: return False - def average_adapter( - self, - adapter_name: str, - input_adapters: Dict[str, float], - combine_strategy: str, - svd_rank: int = None, - **kwargs, - ) -> bool: + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: # add new adapter if self.add_adapter(adapter_name, self.layer_idx): + # average weights avg_state_dict = {} - - # First, check if all input adapters are present - for name in input_adapters.keys(): - if name not in self.loras: - self.delete_adapter(adapter_name) # clean up before raising error - raise ValueError("Adapter {} not found.".format(name)) - - # Now, combine the weights according to the strategy - if combine_strategy == "linear": - for name, weight in input_adapters.items(): + for name, weight in input_adapters.items(): + if name in self.loras: module = self.loras[name] for k, v in module.state_dict().items(): if k in avg_state_dict: avg_state_dict[k] += weight * v else: avg_state_dict[k] = weight * v - - elif combine_strategy == "lora_linear_only_negate_b": - # Same as linear but for negative weights only negate the B matrix and leave A positive - # See Zhang et al. (2023) https://proceedings.neurips.cc/paper_files/paper/2023/hash/299a08ee712d4752c890938da99a77c6-Abstract-Conference.html - for name, weight in input_adapters.items(): - module = self.loras[name] - for k, v in module.state_dict().items(): - if "lora_B" in k: - zhang_weight = weight - elif "lora_A" in k: - zhang_weight = abs(weight) - else: - # This should never happen as we only have lora_A and lora_B in the state_dict - raise ValueError( - f"Key must either contain 'lora_A' or 'lora_B' but is {k}. This should never" - " happen. Please open an issue on GitHub if you encounter this error." - ) - - if k in avg_state_dict: - avg_state_dict[k] += zhang_weight * v - else: - avg_state_dict[k] = zhang_weight * v - - elif combine_strategy == "lora_delta_w_svd": - # Weight the delta_w matrices by the input weights and then use Singular Value Decomposition (SVD) to split them into A and B matrices. 
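# Standalone sketch (assumed shapes and weights, not taken from the patch) of the
# "lora_delta_w_svd" merge that the lora.py hunk above removes: weight each adapter's
# delta_w = B @ A, sum them, then factor the result back into rank-r A/B matrices
# via a truncated SVD, mirroring the removed _average_adapter_lora_delta_w_svd helper.
import torch

d, k, rank = 16, 12, 4
loras = {  # adapter name -> (lora_B, lora_A, mixing weight); values are made up
    "a": (torch.randn(d, rank), torch.randn(rank, k), 0.7),
    "b": (torch.randn(d, rank), torch.randn(rank, k), 0.3),
}
delta_w = sum(w * (B @ A) for B, A, w in loras.values())  # d x k merged weight update
U, S, Vh = torch.linalg.svd(delta_w)                      # delta_w = U @ diag(S) @ Vh
lora_A_new = Vh[:rank, :]                                 # rank x k
lora_B_new = U[:, :rank] @ torch.diag(S[:rank])           # d x rank
# lora_B_new @ lora_A_new is the best rank-`rank` approximation of the weighted sum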
- self._average_adapter_lora_delta_w_svd(input_adapters, avg_state_dict, svd_rank) - - else: - raise ValueError(f"The combine_strategy '{combine_strategy}' is not supported for LoRA.") - + else: + self.delete_adapter(adapter_name) # clean up before raising error + raise ValueError("Adapter {} not found.".format(name)) # load averaged weights self.loras[adapter_name].load_state_dict(avg_state_dict) return True return False - def _average_adapter_lora_delta_w_svd(self, input_adapters: Dict[str, float], avg_state_dict, svd_rank): - # Weight the delta_w matrices by the input weights and then use Singular Value Decomposition to split them into A and B matrices. - if svd_rank is None: - raise ValueError("svd_rank must be set when using 'lora_delta_w_svd'.") - - # Collect delta_w matrices. Shape of every delta_w matrix in the list: d×k - delta_w = [self.loras[adapter_name].delta_w for adapter_name in input_adapters.keys()] - - # If the lora has fan_in_fan_out, we need to transpose the matrices - if self.fan_in_fan_out: - delta_w = [torch.t(delta_w) for delta_w in delta_w] - - delta_w = torch.stack(delta_w, dim=0) # shape: n×d×k - - # Weight the delta_w matrices - weights = torch.tensor(list(input_adapters.values()), device=delta_w.device) # shape: n - weights = weights.view(-1, 1, 1) # shape: n×1×1 - delta_w = delta_w * weights # shape: n×d×k - - # Now bring down to d×k matrix - delta_w = delta_w.sum(dim=0) # shape: d×k - - # Perform SVD to split delta_w into A and B matrices - U, S_diag, V = torch.linalg.svd(delta_w) - - # Reduce rank - U = U[:, :svd_rank] # U is 2D - S_diag = S_diag[:svd_rank] # S_diag is 1D - V = V[:svd_rank, :] # V is 2D - - # The SVD has decomposed delta_w into U, S, and V such that: delta_w = U @ S_diag @ V - # In LoRA we have: delta_w = B @ A - # Hence, we can set: A = V and B = U @ S_diag - if self.fan_in_fan_out: - avg_state_dict["lora_A"] = torch.t(V) - avg_state_dict["lora_B"] = torch.t(U @ torch.diag(S_diag)) - else: - avg_state_dict["lora_A"] = V - avg_state_dict["lora_B"] = U @ torch.diag(S_diag) - class LoRAState(NamedTuple): """Models the input and output states of a LoRA layer. 
@@ -370,7 +289,7 @@ def __init__( attn_key: str = None, fan_in_fan_out: bool = False, no_init_bias: bool = False, - **kwargs, + **kwargs ): if no_init_bias and "bias" not in kwargs: kwargs["bias"] = False @@ -391,7 +310,7 @@ def wrap( model_config: PretrainedConfig, adapters_config: ModelAdaptersConfig, attn_key: str = None, - **kwargs, + **kwargs ): if isinstance(module, Conv1D): new_module = LoRALinearTorch( @@ -489,11 +408,9 @@ def repeat(self, state: LoRAState, channels: int) -> LoRAState: def mean(self, states: List[LoRAState], weights: torch.Tensor) -> LoRAState: return LoRAState( states[0].layer_input, - ( - torch.mean(torch.stack([s.hidden_states for s in states], dim=0) * weights, dim=0) - if states[0].hidden_states is not None - else None - ), + torch.mean(torch.stack([s.hidden_states for s in states], dim=0) * weights, dim=0) + if states[0].hidden_states is not None + else None, states[0].layer_output, states[-1].last, ) @@ -635,7 +552,7 @@ def __init__( adapters_config: ModelAdaptersConfig, fan_in_fan_out: bool = False, no_init_bias: bool = False, - **kwargs, + **kwargs ): if no_init_bias and "bias" not in kwargs: kwargs["bias"] = False @@ -654,7 +571,7 @@ def wrap( location_key: str, model_config: PretrainedConfig, adapters_config: ModelAdaptersConfig, - **kwargs, + **kwargs ): if isinstance(module, Conv1D): new_module = cls( diff --git a/src/adapters/methods/prefix_tuning.py b/src/adapters/methods/prefix_tuning.py index 1f7d4094b..5e98ca266 100644 --- a/src/adapters/methods/prefix_tuning.py +++ b/src/adapters/methods/prefix_tuning.py @@ -186,14 +186,8 @@ def confirm_prefix(self, prefix_name: str) -> bool: del self.prefix_counts[prefix_name] return True - def average_prefix( - self, prefix_name: str, input_adapters: Dict[str, float], combine_strategy: str, **kwargs - ) -> bool: + def average_prefix(self, prefix_name: str, input_adapters: Dict[str, float]) -> bool: if self.confirm_prefix(prefix_name): - # Prefix Tuning only support linear combination - if combine_strategy != "linear": - raise ValueError(f"Combine strategy {combine_strategy} not supported for prefix tuning.") - # average weights avg_state_dict = {} for name, weight in input_adapters.items(): @@ -343,15 +337,9 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: return False - def average_adapter( - self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy: str, **kwargs - ) -> bool: + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: # add new adapter if self.add_adapter(adapter_name, self.layer_idx): - # Prefix Tuning only support linear combination - if combine_strategy != "linear": - raise ValueError(f"Combine strategy {combine_strategy} not supported for prefix tuning.") - # prefix averaging is handled in pool, only average gates here if adapter_name in self.prefix_gates: avg_state_dict = {} @@ -442,8 +430,10 @@ def pad_and_concat(self, states: List[PrefixTuningState]) -> PrefixTuningState: value_states = F.pad(value_states, pad_size, "constant", self.model_config.pad_token_id) # pad attention mask - if pad_length > 0 and attention_mask is not None: + if pad_length > 0: # Masking the padded tokens only works correctly if attention_mask is set + # We assume this to be the case at this point + assert attention_mask is not None, "Attention mask must be set for prefix tuning" attention_mask = F.pad( attention_mask, (max_prefix_length - attention_mask.shape[-1], 0), diff --git a/src/adapters/methods/prompt_tuning.py 
b/src/adapters/methods/prompt_tuning.py index aaf729d9f..8ac2b5fff 100644 --- a/src/adapters/methods/prompt_tuning.py +++ b/src/adapters/methods/prompt_tuning.py @@ -1,7 +1,7 @@ # https://github.com/google-research/prompt-tuning/blob/main/prompt_tuning/train/prompts.py import math -from typing import Callable +from typing import Callable, Dict import numpy as np import torch @@ -161,6 +161,28 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool: return False + def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: + # add new adapter + if self.add_adapter(adapter_name, -1): + # average weights + avg_state_dict = {} + for name, weight in input_adapters.items(): + if name in self.prompt_tunings: + module = self.prompt_tunings[name] + for k, v in module.state_dict().items(): + if k in avg_state_dict: + avg_state_dict[k] += weight * v + else: + avg_state_dict[k] = weight * v + else: + self.delete_adapter(adapter_name) # clean up before raising error + raise ValueError("Adapter {} not found.".format(name)) + # load averaged weights + self.prompt_tunings[adapter_name].load_state_dict(avg_state_dict) + return True + + return False + def forward(self, hidden_states: torch.Tensor): prefix_attention_mask_length = None adapter_setup = self.get_active_setup() diff --git a/src/adapters/model_mixin.py b/src/adapters/model_mixin 2.py similarity index 86% rename from src/adapters/model_mixin.py rename to src/adapters/model_mixin 2.py index 180259581..c6c2ab597 100644 --- a/src/adapters/model_mixin.py +++ b/src/adapters/model_mixin 2.py @@ -1,19 +1,16 @@ import inspect import logging import os +import warnings from abc import ABC, abstractmethod from collections import defaultdict -from copy import deepcopy from os.path import join from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch from torch import nn -from adapters.configuration.adapter_config import ConfigUnion, LoRAConfig -from transformers import GenerationConfig from transformers.modeling_outputs import ModelOutput -from transformers.utils import is_accelerate_available from .composition import AdapterCompositionBlock, Fuse, Stack, parse_composition from .configuration import ADAPTER_CONFIG_MAP, AdapterConfig, AdapterFusionConfig, BnConfig @@ -33,9 +30,6 @@ logger = logging.getLogger(__name__) -if is_accelerate_available(): - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - class InvertibleAdaptersMixin: """Mixin for Transformer models adding invertible adapters.""" @@ -85,17 +79,9 @@ def add_invertible_adapter(self, adapter_name: str) -> bool: return False - def _average_invertible_adapter( - self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy: str - ) -> bool: + def _average_invertible_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: # add new adapter if self.add_invertible_adapter(adapter_name): - if combine_strategy != "linear": - raise ValueError( - f"Combine strategy {combine_strategy} not supported for invertible adapters. Only 'linear' is" - " supported." 
- ) - # average weights avg_state_dict = {} for name, weight in input_adapters.items(): @@ -185,13 +171,9 @@ def add_invertible_adapter(self, adapter_name: str) -> bool: return self.invertible_adapters_base.add_invertible_adapter(adapter_name) return False - def _average_invertible_adapter( - self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy: str - ) -> bool: + def _average_invertible_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool: if self.invertible_adapters_base is not None: - return self.invertible_adapters_base._average_invertible_adapter( - adapter_name, input_adapters, combine_strategy - ) + return self.invertible_adapters_base._average_invertible_adapter(adapter_name, input_adapters) return False def delete_invertible_adapter(self, adapter_name: str): @@ -384,9 +366,6 @@ class ModelAdaptersMixin(PushAdapterToHubMixin, ABC): """Mixin for transformer models adding support for loading/ saving adapters.""" add_base_adapters = False - support_lora_delta_w_svd = ( - True # If True, the model supports the "lora_delta_w_svd" combine_strategy to merge adapter weights. - ) support_prompt_tuning = True # If False, the prompt tuning layer is not added to the model. If True, the prompt tuning layer is added if add_base_adapters is True. def __init__(self, config, *args, **kwargs): @@ -504,6 +483,14 @@ def train_adapter(self, adapter_setup: Union[list, AdapterCompositionBlock], tra self.get_input_embeddings().train() self.get_input_embeddings().weight.requires_grad = True + def train_fusion(self, adapter_setup: Union[list, AdapterCompositionBlock], unfreeze_adapters=False): + """Sets the model into mode for training of adapter fusion determined by a list of adapter names.""" + warnings.warn( + "add_fusion() has been deprecated in favor of add_adapter_fusion(). Please use the newer method instead.", + FutureWarning, + ) + self.train_adapter_fusion(adapter_setup, unfreeze_adapters=unfreeze_adapters) + def train_adapter_fusion(self, adapter_setup: Union[list, AdapterCompositionBlock], unfreeze_adapters=False): """Sets the model into mode for training of adapter fusion determined by a list of adapter names.""" self.train() @@ -624,6 +611,14 @@ def _add_adapter_weights(self, adapter_name: str): if isinstance(self, InvertibleAdaptersMixin) or isinstance(self, InvertibleAdaptersWrapperMixin): self.add_invertible_adapter(adapter_name) + def add_fusion(self, adapter_names: Union[Fuse, list], adapter_fusion_config=None, override_kwargs=None): + warnings.warn( + "add_fusion() has been deprecated in favor of add_adapter_fusion(). Please use the newer method instead.", + FutureWarning, + ) + adapter_fusion_config = AdapterFusionConfig.from_dict(adapter_fusion_config).replace(**override_kwargs) + self.add_adapter_fusion(adapter_names, adapter_fusion_config) + def add_adapter_fusion( self, adapter_names: Union[Fuse, list, str], @@ -788,12 +783,13 @@ def load_adapter( version: str = None, model_name: str = None, load_as: str = None, + source: str = None, custom_weights_loaders: Optional[List[WeightsLoader]] = None, leave_out: Optional[List[int]] = None, id2label=None, set_active: bool = False, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: """ Loads a pre-trained pytorch adapter module from the local file system or a remote location. 
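# Illustrative sketch (assumed usage, not taken from the patch): load_adapter() with the
# `source` argument restored in this file. Checkpoint and adapter repo id are examples only.
from adapters import AutoAdapterModel
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoAdapterModel.from_pretrained("roberta-base")
adapter_name = model.load_adapter(
    "AdapterHub/roberta-base-pf-imdb",  # example Hugging Face Model Hub repo id
    source="hf",     # "hf" = HF Model Hub, "ah" = legacy AdapterHub repo (deprecated), None = try all
    set_active=True,
)
outputs = model(**tokenizer("A great movie!", return_tensors="pt"))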
@@ -804,11 +800,20 @@ def load_adapter( - the identifier of a pre-trained task adapter to be loaded from Adapter Hub - a path to a directory containing adapter weights saved using `model.saved_adapter()` - a URL pointing to a zip folder containing a saved adapter module - config (dict or str, optional): Deprecated. + config (dict or str, optional): The requested configuration of the adapter. + If not specified, will be either: - the default adapter config for the requested adapter if specified - + the global default adapter config version (str, optional): The version of the adapter to be loaded. - model_name (str, optional): Deprecated. + model_name (str, optional): The string identifier of the pre-trained model. load_as (str, optional): Load the adapter using this name. By default, the name with which the adapter was saved will be used. + source (str, optional): Identifier of the source(s) from where to load the adapter. Can be: + + - "ah": search on AdapterHub Hub repo. + Note: the Hub repo has been archived and all adapters have been moved to HuggingFace Model Hub. + Loading from this source is deprecated. + - "hf": search on HuggingFace Model Hub. + - None (default): search on all sources leave_out: Dynamically drop adapter modules in the specified Transformer layers when loading the adapter. set_active (bool, optional): Set the loaded adapter to be the active one. By default (False), the adapter is loaded but not @@ -825,6 +830,7 @@ def load_adapter( version, model_name, load_as, + source=source, leave_out=leave_out, set_active=set_active, **kwargs, @@ -849,7 +855,7 @@ def load_adapter_fusion( custom_weights_loaders: Optional[List[WeightsLoader]] = None, set_active: bool = False, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: """ Loads a pre-trained AdapterFusion layer from the local file system. @@ -1162,12 +1168,7 @@ def adapter_summary(self, as_dict=False) -> Union[str, dict]: s.append("=" * total_length) return "\n".join(s) - def _average_shared_parameters(self, adapter_name: str, input_adapters: Dict[str, float], combine_strategy: str): - if combine_strategy != "linear": - raise ValueError( - f"Combine strategy {combine_strategy} not supported for shared parameters. Only 'linear' is supported." - ) - + def _average_shared_parameters(self, adapter_name: str, input_adapters: Dict[str, float]): avg_state_dict = {} for name, weight in input_adapters.items(): if name in self.base_model.shared_parameters: @@ -1181,109 +1182,38 @@ def _average_shared_parameters(self, adapter_name: str, input_adapters: Dict[str raise ValueError(f"Adapter {name} not found in shared parameters.") self.base_model.shared_parameters[adapter_name] = nn.ParameterDict(avg_state_dict) - def _pre_average_adapter_checks( - self, - adapter_name: str, - adapter_list: List[str], - combine_strategy: str, - valid_combination_strategies: List[str], - is_head=False, - ): - # Check if combine_strategy is valid - if combine_strategy not in valid_combination_strategies: - raise ValueError( - f"Invalid combine_strategy '{combine_strategy}'. Must be one of {valid_combination_strategies}" - ) - - # Some strategies are not supported by all models - if combine_strategy == "lora_delta_w_svd" and not self.base_model.support_lora_delta_w_svd: - raise ValueError( - "This model specifically does not support 'lora_delta_w_svd' as a merging method. Please use a" - " different combine_strategy or a different model." 
- ) - - head_or_adapter = "head" if is_head else "adapter" - - # Provide the user with some information about the adapters to be averaged - logging.info(f"Creating new {head_or_adapter} called {adapter_name} by averaging {adapter_list}.") - if not is_head: - logging.info("In case you want to create a new head as well please use the `average_head` function.") - - if len(adapter_list) == 0: - raise ValueError("No adapters to average. Please provide at least one adapter to average.") - if len(adapter_list) == 1: - logging.info( - "You provided only one adapter to average. If you set `normalize_weights` to true, this will result in" - " duplicating the adapter. If not this will result in scaling the adapter weights. We will use the" - " linear combination strategy for this." - ) - - # For ConfigUnion, only support linear combination - if isinstance(self.adapters_config.get(adapter_list[0]), ConfigUnion): - if combine_strategy != "linear": - raise ValueError( - "Combining adapters with ConfigUnion is only supported with the 'linear' combine_strategy." - ) - def average_adapter( self, adapter_name: str, - adapter_list: Union[List[str], Dict[str, float]], + adapter_list: List[str], weights: Optional[List[float]] = None, - combine_strategy: str = "linear", normalize_weights: bool = True, overwrite_ok: bool = False, set_active: bool = False, - svd_rank: int = None, # if other combination strategies are implemented that need new parameters, this should be moved to **kwargs ): """ Adds a new adapter module as weighted average of a set of existing adapter modules. Args: adapter_name (str): The name of the adapter module to be added. - adapter_list (List[str] or Dict[str, float]): + input_adapters (List[str] or Dict[str, float]): Specifies the existing adapters whose weights should be averaged. Can either be a list of adapter names or a dictionary mapping adapter names to weights. - weights (Optional[List[float]], optional): The weights corresponding to each adapter module in the list. - If not provided, equal weights will be assigned to each adapter. - combine_strategy (str, optional): The strategy to combine the adapter modules. - Available options are "linear", "lora_linear_only_negate_b", and "lora_delta_w_svd". - See https://docs.adapterhub.ml/adapter_composition.html#merging-adapters - Defaults to "linear". - normalize_weights (bool, optional): Whether to normalize the weights. - If True, the weights will be normalized to sum up to 1. - Defaults to True. overwrite_ok (bool, optional): Overwrite an adapter with the same name if it exists. By default (False), an exception is thrown. set_active (bool, optional): Set the adapter to be the active one. By default (False), the adapter is added but not activated. - svd_rank (int, optional): The rank to be used for Singular Value Decomposition (SVD) when averaging LoRA adapters. - This parameter is only applicable when the combine_strategy is set to "lora_delta_w_svd". - Defaults to None. 
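# Illustrative sketch (assumed usage, not taken from the patch): average_adapter() as defined on
# the "-" side of this hunk, i.e. with combine_strategy and svd_rank. Adapter names are
# placeholders and are assumed to be LoRA adapters sharing the same configuration.
model.average_adapter(
    "merged",
    ["a", "b"],
    weights=[0.7, 0.3],
    combine_strategy="lora_delta_w_svd",  # "linear" and "lora_linear_only_negate_b" also exist
    svd_rank=8,                           # only used by the SVD strategy
    set_active=True,
)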
""" - - valid_combination_strategies = ["linear", "lora_linear_only_negate_b", "lora_delta_w_svd"] - self._pre_average_adapter_checks(adapter_name, adapter_list, combine_strategy, valid_combination_strategies) - + # To be able to average the weights, all adapter configs must be the same config = None for name in adapter_list: if config is None: config = self.adapters_config.get(name) - elif get_adapter_config_hash(config, ignore_params=["dropout", "init_weights"]) != get_adapter_config_hash( - self.adapters_config.get(name), ignore_params=["dropout", "init_weights"] - ): + elif get_adapter_config_hash(config) != get_adapter_config_hash(self.adapters_config.get(name)): raise ValueError( "Cannot average adapters with different configurations. " "Please make sure all adapters have the same configuration." ) - - # In case svd_rank is set, change the config to use the new rank - if svd_rank is not None: - if isinstance(config, LoRAConfig): - config = config.replace(r=svd_rank) - else: - logging.warning("SVD rank can only be set when averaging LoRA adapters. Ignoring svd_rank.") - # In case adapter already exists and we allow overwriting, explicitly delete the existing one first if overwrite_ok and adapter_name in self.adapters_config: self.delete_adapter(adapter_name) @@ -1299,25 +1229,17 @@ def average_adapter( sum_weights = 1.0 input_adapters = {name: weight / sum_weights for name, weight in zip(adapter_list, weights)} try: - self.apply_to_adapter_layers( - lambda i, layer: layer.average_adapter( - adapter_name, input_adapters, combine_strategy, svd_rank=svd_rank - ) - ) - self.apply_to_basemodel_childs( - lambda i, child: child.average_adapter( - adapter_name, input_adapters, combine_strategy, svd_rank=svd_rank - ) - ) + self.apply_to_adapter_layers(lambda i, layer: layer.average_adapter(adapter_name, input_adapters)) + self.apply_to_basemodel_childs(lambda i, child: child.average_adapter(adapter_name, input_adapters)) # PHM Layer if self.adapters_config.match(adapter_name, BnConfig, location_key="phm_layer"): - self._average_shared_parameters(adapter_name, input_adapters, combine_strategy) + self._average_shared_parameters(adapter_name, input_adapters) # Prefix Tuning for module in self.modules(): if isinstance(module, PrefixTuningPool): - module.average_prefix(adapter_name, input_adapters, combine_strategy) + module.average_prefix(adapter_name, input_adapters) if isinstance(self, InvertibleAdaptersMixin) or isinstance(self, InvertibleAdaptersWrapperMixin): - self._average_invertible_adapter(adapter_name, input_adapters, combine_strategy) + self._average_invertible_adapter(adapter_name, input_adapters) except ValueError as ex: self.delete_adapter(adapter_name) raise ex @@ -1358,21 +1280,10 @@ def reset_adapter(self): # HACK Copied from transformers/generation/utils.py def _prepare_encoder_decoder_kwargs_for_generation( - self, - inputs_tensor: torch.Tensor, - model_kwargs, - model_input_name: Optional[str], - generation_config: GenerationConfig, + self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None ) -> Dict[str, Any]: # 1. get encoder encoder = self.get_encoder() - # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device - # as the inputs. - if hasattr(self, "hf_device_map"): - if hasattr(encoder, "_hf_hook"): - encoder._hf_hook.io_same_device = True - else: - add_hook_to_module(encoder, AlignDevicesHook(io_same_device=True)) # 2. 
prepare encoder args and encoder kwargs from model kwargs irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] @@ -1381,6 +1292,7 @@ def _prepare_encoder_decoder_kwargs_for_generation( for argument, value in model_kwargs.items() if not any(argument.startswith(p) for p in irrelevant_prefix) } + encoder_signature = set(inspect.signature(encoder.forward).parameters) encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature if not encoder_accepts_wildcard: @@ -1389,8 +1301,6 @@ def _prepare_encoder_decoder_kwargs_for_generation( for argument, value in encoder_kwargs.items() if argument in encoder_signature or argument == "adapter_input_parallelized" } - encoder_kwargs["output_attentions"] = generation_config.output_attentions - encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states # 3. make sure that encoder returns `ModelOutput` model_input_name = model_input_name if model_input_name is not None else self.main_input_name @@ -1545,12 +1455,12 @@ def train_adapter(self, adapter_setup: Union[list, AdapterCompositionBlock], tra super().train_adapter(adapter_setup, train_embeddings) else: self.base_model.train_adapter(adapter_setup, train_embeddings) - - # If the head has tied weights with the embedding layer (e.g. masked language modeling head), the last layer is - # only trained when train_embeddings is set to True if not train_embeddings: self.freeze_embeddings() + # Hack to prevent HF Trainer from throwing an error due to peft missing. + self._hf_peft_config_loaded = True + def train_adapter_fusion(self, adapter_setup: Union[list, AdapterCompositionBlock], unfreeze_adapters=False): """ Sets the model into mode for training of adapter fusion determined by a list of adapter names. If @@ -1562,93 +1472,6 @@ def train_adapter_fusion(self, adapter_setup: Union[list, AdapterCompositionBloc self.base_model.train_adapter_fusion(adapter_setup, unfreeze_adapters=unfreeze_adapters) self.freeze_embeddings() - def average_head( - self, - head_name: str, - head_list: Union[List[str], Dict[str, float]], - weights: Optional[List[float]] = None, - normalize_weights: bool = True, - overwrite_ok: bool = False, - set_active: bool = False, - ): - """ - Adds a new prediction head as a weighted average of a set of existing prediction heads. - - Args: - head_name (str): The name of the new prediction head to be added. - head_list (List[str] or Dict[str, float]): - Specifies the existing heads whose weights should be averaged. Can either be a list of head names - or a dictionary mapping head names to weights. - weights (Optional[List[float]], optional): The weights corresponding to each head in the list. - If not provided, equal weights will be assigned to each head. - normalize_weights (bool, optional): Whether to normalize the weights. - If True, the weights will be normalized to sum up to 1. - Defaults to True. - overwrite_ok (bool, optional): - Overwrite a head with the same name if it exists. By default (False), an exception is thrown. - set_active (bool, optional): - Set the head to be the active one. By default (False), the head is added but not activated. 
- """ - - self._pre_average_adapter_checks( - head_name, head_list, "linear", ["linear"], is_head=True - ) # Currently, only linear averaging is supported for heads - - # Ensure all heads to be averaged are of the same class - head_class = type(self.heads[head_list[0]]) - for name in head_list: - if not isinstance(self.heads[name], head_class): - raise ValueError( - f"Cannot average heads of different classes. All heads must be of type {head_class.__name__}." - ) - - # Ensure that all heads have the same configuration - head_config = self.heads[head_list[0]].config - - for name in head_list: - if get_adapter_config_hash(head_config, ignore_params=["dropout_prob"]) != get_adapter_config_hash( - self.heads[name].config, ignore_params=["dropout_prob"] - ): - raise ValueError( - "Cannot average heads with different configurations. " - "Please make sure all heads have the same configuration." - ) - - # In case the head already exists and we allow overwriting, explicitly delete the existing one first - if overwrite_ok and head_name in self.heads: - self.delete_head(head_name) - - # Now that we have ensured that all heads are of the same class and have the same configuration, - # we can add the new head by copy one of the existing heads and then replacing the weights - new_head = deepcopy(self.heads[head_list[0]]) # This is a PredictionHead - new_head.name = head_name - - if weights is None: - eq_weight = 1.0 / len(head_list) - input_heads = {name: eq_weight for name in head_list} - else: - # Normalize weights if specified - if normalize_weights: - sum_weights = sum(weights) - else: - sum_weights = 1.0 - input_heads = {name: weight / sum_weights for name, weight in zip(head_list, weights)} - - # Average the state dictionaries of the heads - avg_state_dict = {} - for name, weight in input_heads.items(): - for k, v in self.heads[name].state_dict().items(): - if k in avg_state_dict: - avg_state_dict[k] += weight * v - else: - avg_state_dict[k] = weight * v - - # Load the averaged state dictionary into the new head - new_head.load_state_dict(avg_state_dict) - - # Add the new head to the model - self.add_prediction_head(new_head, set_active=set_active) - def save_head(self, save_directory: str, head_name: str = None, use_safetensors: bool = False) -> None: """Saves a model prediction head to a directory such that it can be reloaded using `load_head()`. @@ -1666,7 +1489,7 @@ def load_head( load_as: str = None, id2label: Dict[int, str] = None, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: """Loads a model prediction head from a directory where it was saved using `save_head()`. 
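Illustrative usage sketch (not part of the patch): the `load_adapter(..., source=...)` and `average_adapter(...)` signatures restored in the hunks above are typically driven as shown below. This is a minimal sketch assuming the public adapters API; the checkpoint, Hub identifier, and adapter names are placeholders.

import adapters
from adapters import SeqBnConfig
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
adapters.init(model)

# Load a pre-trained adapter, searching the HuggingFace Model Hub ("hf" source).
# The Hub identifier below is a placeholder.
model.load_adapter("AdapterHub/bert-base-uncased-pf-sst2", source="hf", load_as="sst")

# Average two adapters that share the same configuration into a new adapter;
# with the signature above, only linear weighted averaging is available.
model.add_adapter("task-a", config=SeqBnConfig())
model.add_adapter("task-b", config=SeqBnConfig())
model.average_adapter("task-avg", ["task-a", "task-b"], weights=[0.7, 0.3], normalize_weights=True, set_active=True)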
@@ -1715,13 +1538,14 @@ def load_adapter( version: str = None, model_name: str = None, load_as: str = None, + source: str = None, with_head: bool = True, custom_weights_loaders: Optional[List[WeightsLoader]] = None, leave_out: Optional[List[int]] = None, id2label=None, set_active: bool = False, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: if with_head: if custom_weights_loaders is None: @@ -1744,6 +1568,7 @@ def load_adapter( version=version, model_name=model_name, load_as=load_as, + source=source, custom_weights_loaders=custom_weights_loaders, leave_out=leave_out, id2label=id2label, @@ -1829,7 +1654,7 @@ def load_adapter_fusion( set_active: bool = False, with_head: bool = True, use_safetensors: bool = False, - **kwargs, + **kwargs ) -> str: if with_head: if custom_weights_loaders is None: diff --git a/src/adapters/models/__init__.py b/src/adapters/models/__init__.py index 8e759698d..6d54544c2 100644 --- a/src/adapters/models/__init__.py +++ b/src/adapters/models/__init__.py @@ -14,18 +14,11 @@ CLIPTextTransformerAdaptersMixin, CLIPVisionModelAdaptersMixin, ) -from .deberta.mixin_deberta import DebertaModelAdaptersMixin from .distilbert.mixin_distilbert import DistilBertModelAdaptersMixin, DistilBertTransformerAdaptersMixin from .gpt2.mixin_gpt2 import GPT2ModelAdapterMixin from .gptj.mixin_gptj import GPTJMLPAdaptersMixin, GPTJModelAdapterMixin from .llama.mixin_llama import LlamaForQuestionAnsweringAdapterMixin, LlamaModelAdapterMixin from .mistral.mixin_mistral import MistralModelAdapterMixin -from .plbart.mixin_plbart import ( - PLBartDecoderAdaptersMixin, - PLBartDecoderWrapperAdaptersMixin, - PLBartEncoderAdaptersMixin, - PLBartModelAdaptersMixin, -) from .t5.mixin_t5 import ( T5BlockAdaptersMixin, T5ForCondiditionalGenerationWithHeadsMixin, @@ -41,8 +34,8 @@ "AlbertModel": AlbertModelAdaptersMixin, "BartEncoder": BartEncoderAdaptersMixin, "BartDecoder": BartDecoderAdaptersMixin, - "BartDecoderWrapper": BartDecoderWrapperAdaptersMixin, "BartModel": BartModelAdaptersMixin, + "BartDecoderWrapper": BartDecoderWrapperAdaptersMixin, "BeitIntermediate": BeitIntermediateAdaptersMixin, "BeitOutput": BeitOutputAdaptersMixin, "BeitModel": BeitModelAdaptersMixin, @@ -68,10 +61,6 @@ "MT5ForConditionalGeneration": T5ForCondiditionalGenerationWithHeadsMixin, "MT5ForQuestionAnswering": T5ForQuestionAnsweringWithHeadsMixin, "MT5EncoderModel": T5ModelAdaptersMixin, - "PLBartEncoder": PLBartEncoderAdaptersMixin, - "PLBartDecoder": PLBartDecoderAdaptersMixin, - "PLBartModel": PLBartModelAdaptersMixin, - "PLBartDecoderWrapper": PLBartDecoderWrapperAdaptersMixin, "GPT2Model": GPT2ModelAdapterMixin, "GPTJMLP": GPTJMLPAdaptersMixin, "GPTJModel": GPTJModelAdapterMixin, @@ -88,9 +77,9 @@ "XLMRobertaModel": BertModelAdaptersMixin, "XmodLayer": BertLayerAdaptersMixin, "XmodModel": XmodModelAdaptersMixin, - "DebertaModel": DebertaModelAdaptersMixin, + "DebertaModel": BertModelAdaptersMixin, "DebertaLayer": BertLayerAdaptersMixin, - "DebertaV2Model": DebertaModelAdaptersMixin, + "DebertaV2Model": BertModelAdaptersMixin, "DebertaV2Layer": BertLayerAdaptersMixin, "BertGenerationEncoder": BertModelAdaptersMixin, "BertGenerationLayer": BertLayerAdaptersMixin, diff --git a/src/adapters/models/albert/adapter_model.py b/src/adapters/models/albert/adapter_model.py index 73892bb2f..8f6c07d47 100644 --- a/src/adapters/models/albert/adapter_model.py +++ b/src/adapters/models/albert/adapter_model.py @@ -52,7 +52,7 @@ def forward( head=None, output_adapter_gating_scores=False, 
output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None diff --git a/src/adapters/models/auto/adapter_model.py b/src/adapters/models/auto/adapter_model.py index 7ab5cd80f..6a962ff63 100644 --- a/src/adapters/models/auto/adapter_model.py +++ b/src/adapters/models/auto/adapter_model.py @@ -22,10 +22,9 @@ ("gpt2", "GPT2AdapterModel"), ("gptj", "GPTJAdapterModel"), ("llama", "LlamaAdapterModel"), - ("mbart", "MBartAdapterModel"), ("mistral", "MistralAdapterModel"), + ("mbart", "MBartAdapterModel"), ("mt5", "MT5AdapterModel"), - ("plbart", "PLBartAdapterModel"), ("roberta", "RobertaAdapterModel"), ("t5", "T5AdapterModel"), ("vit", "ViTAdapterModel"), diff --git a/src/adapters/models/bart/adapter_model.py b/src/adapters/models/bart/adapter_model.py index 4e07fc5f1..384955cc1 100644 --- a/src/adapters/models/bart/adapter_model.py +++ b/src/adapters/models/bart/adapter_model.py @@ -67,7 +67,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): @@ -127,7 +127,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs, + **kwargs ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/adapters/models/bart/modeling_bart.py b/src/adapters/models/bart/modeling_bart.py index 080455b49..b347fddf0 100644 --- a/src/adapters/models/bart/modeling_bart.py +++ b/src/adapters/models/bart/modeling_bart.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch BART model.""" +""" PyTorch BART model.""" from typing import Optional, Tuple import torch diff --git a/src/adapters/models/beit/modeling_beit.py b/src/adapters/models/beit/modeling_beit.py index 865fcdeae..1ed5082be 100644 --- a/src/adapters/models/beit/modeling_beit.py +++ b/src/adapters/models/beit/modeling_beit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch BEiT model.""" +""" PyTorch BEiT model.""" import math @@ -33,9 +33,7 @@ def forward( hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["BeitRelativePositionBias"] = None, - interpolate_pos_encoding: bool = False, - resolution: Optional[Tuple[int]] = None, + relative_position_bias: Optional[BeitRelativePositionBias] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: mixed_query_layer = self.query(hidden_states) @@ -52,11 +50,7 @@ def forward( # Add relative position bias if present. if self.relative_position_bias is not None: - height, width = resolution - window_size = (height // self.config.patch_size, width // self.config.patch_size) - attention_scores = attention_scores + self.relative_position_bias( - window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1] - ) + attention_scores = attention_scores + self.relative_position_bias().unsqueeze(0) # Add shared relative position bias if provided. 
if relative_position_bias is not None: @@ -92,17 +86,13 @@ def forward( hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, - relative_position_bias: Optional["BeitRelativePositionBias"] = None, - interpolate_pos_encoding: bool = False, - resolution: Optional[Tuple[int]] = None, + relative_position_bias: Optional[BeitRelativePositionBias] = None, ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: self_attention_outputs = self.attention( self.layernorm_before(hidden_states), # in BEiT, layernorm is applied before self-attention head_mask, output_attentions=output_attentions, relative_position_bias=relative_position_bias, - interpolate_pos_encoding=interpolate_pos_encoding, - resolution=resolution, ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights diff --git a/src/adapters/models/bert/adapter_model.py b/src/adapters/models/bert/adapter_model.py index a15f3e432..0b8e18943 100644 --- a/src/adapters/models/bert/adapter_model.py +++ b/src/adapters/models/bert/adapter_model.py @@ -54,7 +54,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None diff --git a/src/adapters/models/bert/modeling_bert.py b/src/adapters/models/bert/modeling_bert.py index de860151e..ea60b6f5d 100644 --- a/src/adapters/models/bert/modeling_bert.py +++ b/src/adapters/models/bert/modeling_bert.py @@ -23,17 +23,13 @@ import torch.utils.checkpoint from torch import nn -from transformers.models.bert.modeling_bert import BertOutput, BertSdpaSelfAttention, BertSelfAttention, BertSelfOutput -from transformers.utils import logging +from transformers.models.bert.modeling_bert import BertOutput, BertSelfAttention, BertSelfOutput from ...composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel from ...utils import prefix_attention_mask from .mixin_bert import BertOutputAdaptersMixin, BertSelfAttentionAdaptersMixin, BertSelfOutputAdaptersMixin -logger = logging.get_logger(__name__) - - class BertSelfAttentionWithAdapters(BertSelfAttentionAdaptersMixin, BertSelfAttention): def forward( self, @@ -146,107 +142,6 @@ def forward( return outputs -class BertSdpaSelfAttentionWithAdapters(BertSelfAttentionAdaptersMixin, BertSdpaSelfAttention): - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - attention_mask = prefix_attention_mask(attention_mask, [2, 3]) # type: ignore - - if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: - # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. - logger.warning_once( - "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support" - " non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. 
Falling back to" - " the manual attention implementation, but specifying the manual implementation will be required from" - " Transformers version v5.0.0 onwards. This warning can be removed using the argument" - ' `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - bsz, tgt_len, _ = hidden_states.size() - - # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention - # mask needs to be such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - current_states = encoder_hidden_states if is_cross_attention else hidden_states - attention_mask = encoder_attention_mask if is_cross_attention else attention_mask - - # Check `seq_length` of `past_key_value` == `len(current_states)` to support prefix tuning - if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]: - key_layer, value_layer = past_key_value - else: - key_layer = self.transpose_for_scores(self.key(current_states)) - value_layer = self.transpose_for_scores(self.value(current_states)) - if past_key_value is not None and not is_cross_attention: - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - - query_layer = self.transpose_for_scores(self.query(hidden_states)) - query_layer, key_layer, value_layer = match_attn_matrices_for_parallel(query_layer, key_layer, value_layer) - (attention_mask,) = adjust_tensors_for_parallel(query_layer, attention_mask) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - key_layer, value_layer, attention_mask = self.prefix_tuning( - key_layer, value_layer, hidden_states, attention_mask - ) - (query_layer,) = adjust_tensors_for_parallel(key_layer, query_layer) - bsz = query_layer.size(0) - - # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom - # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0. - # Reference: https://github.com/pytorch/pytorch/issues/112577 - if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None: - query_layer = query_layer.contiguous() - key_layer = key_layer.contiguous() - value_layer = value_layer.contiguous() - - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal - # mask in case tgt_len == 1. 
- is_causal = self.is_decoder and attention_mask is None and tgt_len > 1 - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - attn_mask=attention_mask, - dropout_p=self.dropout_prob if self.training else 0.0, - is_causal=is_causal, - ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) - - outputs = (attn_output,) - if self.is_decoder: - outputs = outputs + (past_key_value,) - return outputs - - class BertSelfOutputWithAdapters(BertSelfOutputAdaptersMixin, BertSelfOutput): def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(hidden_states) diff --git a/src/adapters/models/bert_generation/adapter_model.py b/src/adapters/models/bert_generation/adapter_model.py index d3822e24a..072c1b099 100644 --- a/src/adapters/models/bert_generation/adapter_model.py +++ b/src/adapters/models/bert_generation/adapter_model.py @@ -54,7 +54,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None diff --git a/src/adapters/models/clip/adapter_model.py b/src/adapters/models/clip/adapter_model.py index 7734cd021..39382757e 100644 --- a/src/adapters/models/clip/adapter_model.py +++ b/src/adapters/models/clip/adapter_model.py @@ -42,7 +42,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): outputs, context = self.clip( input_ids=input_ids, diff --git a/src/adapters/models/clip/modeling_clip.py b/src/adapters/models/clip/modeling_clip.py index 7328e532c..b74a0308e 100644 --- a/src/adapters/models/clip/modeling_clip.py +++ b/src/adapters/models/clip/modeling_clip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch CLIP model.""" +""" PyTorch CLIP model.""" from typing import Optional, Tuple @@ -21,25 +21,11 @@ import torch.utils.checkpoint from torch import nn -from transformers.models.clip.modeling_clip import ( - CLIPAttention, - CLIPEncoderLayer, - CLIPFlashAttention2, - CLIPSdpaAttention, -) -from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_2 -from transformers.utils import is_flash_attn_2_available, logging - - -if is_flash_attn_2_available(): - from transformers.modeling_flash_attention_utils import _flash_attention_forward +from transformers.models.clip.modeling_clip import CLIPAttention, CLIPEncoderLayer from .mixin_clip import CLIPAttentionAdaptersMixin, CLIPEncoderLayerAdaptersMixin -logger = logging.get_logger(__name__) - - class CLIPAttentionWithAdapters(CLIPAttentionAdaptersMixin, CLIPAttention): def forward( self, @@ -60,11 +46,9 @@ def forward( proj_shape = (bsz * self.num_heads, -1, self.head_dim) query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) - # >>> END AH Changes <<< key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) @@ -131,155 +115,6 @@ def forward( return attn_output, attn_weights_reshaped -class CLIPFlashAttention2WithAdapters(CLIPAttentionAdaptersMixin, CLIPFlashAttention2): - # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - output_attentions = False - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # >>> START AH Changes <<< - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - # >>> END AH Changes <<< - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim) - value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim) - - dropout_rate = self.dropout if self.training else 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. 
- - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = _flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - is_causal=causal_attention_mask is not None, - use_top_left_mask=self._flash_attn_uses_top_left_mask, - ) - - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous() - attn_output = self.out_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights - - -class CLIPSdpaAttentionWithAdapters(CLIPAttentionAdaptersMixin, CLIPSdpaAttention): - # Adapted from CLIPAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "CLIPModel is using CLIPSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not " - "support `output_attentions=True`. Falling back to the manual attention implementation, but specifying " - "the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can " - 'be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - ) - - # CLIP text model uses both `causal_attention_mask` and `attention_mask` - if attention_mask is not None and causal_attention_mask is not None: - attn_mask = attention_mask + causal_attention_mask - elif causal_attention_mask is not None: - attn_mask = causal_attention_mask - else: - attn_mask = attention_mask - - bsz, tgt_len, embed_dim = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) - - # >>> START AH Changes <<< - key_states, value_states, attn_mask = self.prefix_tuning(key_states, value_states, hidden_states, attn_mask) - # >>> END AH Changes <<< - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. 
- if not is_torch_greater_or_equal_than_2_2 and query_states.device.type == "cuda" and attn_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - # CLIP text model uses both `causal_attention_mask` and `attention_mask` sequentially. - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attn_mask, - dropout_p=self.dropout if self.training else 0.0, - scale=self.scale, - ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, None - - class CLIPEncoderLayerWithAdapters(CLIPEncoderLayerAdaptersMixin, CLIPEncoderLayer): def forward( self, diff --git a/src/adapters/models/deberta/adapter_model.py b/src/adapters/models/deberta/adapter_model.py index f5e15e8cb..32ec9cd45 100644 --- a/src/adapters/models/deberta/adapter_model.py +++ b/src/adapters/models/deberta/adapter_model.py @@ -45,7 +45,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None diff --git a/src/adapters/models/deberta/mixin_deberta.py b/src/adapters/models/deberta/mixin_deberta.py index 817b9a83e..272302931 100644 --- a/src/adapters/models/deberta/mixin_deberta.py +++ b/src/adapters/models/deberta/mixin_deberta.py @@ -1,7 +1,6 @@ from ...methods.lora import LoRAMergedLinear from ...methods.prefix_tuning import PrefixTuningLayer from ...utils import patch_forward -from ..bert.mixin_bert import BertModelAdaptersMixin class DebertaSelfAttentionAdaptersMixin: @@ -15,8 +14,3 @@ def init_adapters(self, model_config, adapters_config): self.location_key + "_prefix" if self.location_key else None, model_config, adapters_config ) patch_forward(self) - - -class DebertaModelAdaptersMixin(BertModelAdaptersMixin): - # Same as BERT, except that Deberta does not support the "lora_delta_w_svd" combine_strategy - support_lora_delta_w_svd = False diff --git a/src/adapters/models/deberta/modeling_deberta.py b/src/adapters/models/deberta/modeling_deberta.py index 4380b5e03..1feca72b4 100644 --- a/src/adapters/models/deberta/modeling_deberta.py +++ b/src/adapters/models/deberta/modeling_deberta.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch DeBERTa model.""" +""" PyTorch DeBERTa model.""" import torch import torch.utils.checkpoint diff --git a/src/adapters/models/deberta_v2/adapter_model.py b/src/adapters/models/deberta_v2/adapter_model.py index 07092debd..c306f8f47 100644 --- a/src/adapters/models/deberta_v2/adapter_model.py +++ b/src/adapters/models/deberta_v2/adapter_model.py @@ -47,7 +47,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None diff --git a/src/adapters/models/deberta_v2/modeling_deberta_v2.py b/src/adapters/models/deberta_v2/modeling_deberta_v2.py index bc41ae82a..56d6fec44 100644 --- a/src/adapters/models/deberta_v2/modeling_deberta_v2.py +++ b/src/adapters/models/deberta_v2/modeling_deberta_v2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch DeBERTa-v2 model.""" +""" PyTorch DeBERTa-v2 model.""" import torch import torch.utils.checkpoint diff --git a/src/adapters/models/distilbert/adapter_model.py b/src/adapters/models/distilbert/adapter_model.py index 3f38c893c..c28f12440 100644 --- a/src/adapters/models/distilbert/adapter_model.py +++ b/src/adapters/models/distilbert/adapter_model.py @@ -74,7 +74,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/adapters/models/distilbert/modeling_distilbert.py b/src/adapters/models/distilbert/modeling_distilbert.py index e59aa1ad5..cbd501942 100644 --- a/src/adapters/models/distilbert/modeling_distilbert.py +++ b/src/adapters/models/distilbert/modeling_distilbert.py @@ -14,8 +14,8 @@ # limitations under the License. """ -PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in -part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) + PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in + part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ diff --git a/src/adapters/models/electra/adapter_model.py b/src/adapters/models/electra/adapter_model.py index 57e20fadb..dbccce40d 100644 --- a/src/adapters/models/electra/adapter_model.py +++ b/src/adapters/models/electra/adapter_model.py @@ -54,7 +54,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None diff --git a/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py b/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py index 1572087d9..43178898f 100644 --- a/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/adapters/models/encoder_decoder/modeling_encoder_decoder.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -"""Classes to support Encoder-Decoder architectures""" +""" Classes to support Encoder-Decoder architectures""" from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel diff --git a/src/adapters/models/gpt2/adapter_model.py b/src/adapters/models/gpt2/adapter_model.py index 2cfbdc882..041ab2a18 100644 --- a/src/adapters/models/gpt2/adapter_model.py +++ b/src/adapters/models/gpt2/adapter_model.py @@ -65,7 +65,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/adapters/models/gpt2/mixin_gpt2.py b/src/adapters/models/gpt2/mixin_gpt2.py index 3362fe4dc..d52952130 100644 --- a/src/adapters/models/gpt2/mixin_gpt2.py +++ b/src/adapters/models/gpt2/mixin_gpt2.py @@ -60,7 +60,6 @@ def init_adapters(self, model_config, adapters_config): class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin): support_prompt_tuning = False - support_lora_delta_w_svd = False def init_adapters(self, model_config, adapters_config): super().init_adapters(model_config, adapters_config) diff --git a/src/adapters/models/gpt2/modeling_gpt2.py b/src/adapters/models/gpt2/modeling_gpt2.py index bb6410f83..1c571c23f 100644 --- a/src/adapters/models/gpt2/modeling_gpt2.py +++ b/src/adapters/models/gpt2/modeling_gpt2.py @@ -20,16 +20,12 @@ import torch import torch.utils.checkpoint -from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2SdpaAttention -from transformers.utils import logging +from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_ from .mixin_gpt2 import GPT2AttentionAdaptersMixin, GPT2DecoderBlockAdaptersMixin -logger = logging.get_logger(__name__) - - class GPT2AttentionWithAdapters(GPT2AttentionAdaptersMixin, GPT2Attention): def forward( self, @@ -69,10 +65,8 @@ def forward( else: present = None - # >>> START AH Changes <<< key, value, attention_mask = self.prefix_tuning(key, value, hidden_states, attention_mask) (query,) = adjust_tensors_for_parallel(key, query) - # >>> END AH Changes <<< if self.reorder_and_upcast_attn: attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) @@ -90,104 +84,6 @@ def forward( return outputs # a, present, (attentions) -class GPT2SdpaAttentionWithAdapters(GPT2AttentionAdaptersMixin, GPT2SdpaAttention): - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: - if output_attentions or head_mask is not None: - logger.warning_once( - "`GPT2SdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " - "`output_attentions=True` or `head_mask`. 
Falling back to the manual attention implementation, but " - "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - bsz, q_len, _ = hidden_states.size() - - # Initial attention projections - is_cross_attention = encoder_hidden_states is not None - if is_cross_attention: - if not hasattr(self, "q_attn"): - raise ValueError( - "If class is used as cross attention, the weights `q_attn` have to be defined. " - "Please make sure to instantiate class with `GPT2SdpaAttention(..., is_cross_attention=True)`." - ) - - query = self.q_attn(hidden_states) - key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) - attention_mask = encoder_attention_mask - else: - query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) - - # Optional kv caching - if layer_past is not None: - past_key = layer_past[0] - past_value = layer_past[1] - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - - present = None - if use_cache is True: - present = (key, value) - - # >>> START AH Changes <<< - key, value, attention_mask = self.prefix_tuning(key, value, hidden_states, attention_mask) - (query,) = adjust_tensors_for_parallel(key, query) - bsz = key.shape[0] - # >>> END AH Changes <<< - - # Avoid torch==2.1.2 specific bug for the memory-efficient backend in SDPA - if self.require_contiguous_qkv and query.device.type == "cuda" and attention_mask is not None: - query = query.contiguous() - key = key.contiguous() - value = value.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
- is_causal = True if attention_mask is None and q_len > 1 and not is_cross_attention else False - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query, - key, - value, - attn_mask=attention_mask, - dropout_p=self.attn_dropout.p if self.training else 0.0, - is_causal=is_causal, - ) - - # Reshape outputs - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.embed_dim) - - # Final projection - attn_output = self.c_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - return attn_output, present, None - - class GPT2BlockWithAdapters(GPT2DecoderBlockAdaptersMixin, GPT2Block): def forward( self, diff --git a/src/adapters/models/gptj/adapter_model.py b/src/adapters/models/gptj/adapter_model.py index f029f840d..4553ebf2b 100644 --- a/src/adapters/models/gptj/adapter_model.py +++ b/src/adapters/models/gptj/adapter_model.py @@ -63,7 +63,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/adapters/models/gptj/modeling_gptj.py b/src/adapters/models/gptj/modeling_gptj.py index 3880df12c..700e919a1 100644 --- a/src/adapters/models/gptj/modeling_gptj.py +++ b/src/adapters/models/gptj/modeling_gptj.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch GPT-J model.""" +""" PyTorch GPT-J model.""" from typing import Optional, Tuple, Union diff --git a/src/adapters/models/llama/adapter_model.py b/src/adapters/models/llama/adapter_model.py index c3116fbe1..1076677ac 100644 --- a/src/adapters/models/llama/adapter_model.py +++ b/src/adapters/models/llama/adapter_model.py @@ -17,7 +17,7 @@ @add_start_docstrings( """ -The Llama Model that allows the loading of different heads for different tasks. This enables a flexible use of the +The Llama Model that allows the loading of different heads dor different tasks. This enables a flexible use of the models and adpters. Since this class does classification on the last token, it requires to know the position of the last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since @@ -64,7 +64,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( diff --git a/src/adapters/models/llama/modeling_llama.py b/src/adapters/models/llama/modeling_llama.py index 461cdde2b..f62091c47 100644 --- a/src/adapters/models/llama/modeling_llama.py +++ b/src/adapters/models/llama/modeling_llama.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch LLaMA model.""" +""" PyTorch LLaMA model.""" import math import warnings from typing import Optional, Tuple @@ -28,16 +28,8 @@ from torch import nn from adapters.composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel -from transformers.cache_utils import Cache, StaticCache -from transformers.modeling_flash_attention_utils import _flash_attention_forward -from transformers.models.llama.modeling_llama import ( - LlamaAttention, - LlamaDecoderLayer, - LlamaFlashAttention2, - LlamaSdpaAttention, - apply_rotary_pos_emb, - repeat_kv, -) +from transformers.cache_utils import Cache +from transformers.models.llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, apply_rotary_pos_emb, repeat_kv from transformers.utils import logging from .mixin_llama import LlamaAttentionMixin, LlamaDecoderLayerMixin @@ -58,7 +50,6 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -89,23 +80,13 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." 
- ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings + past_key_value = getattr(self, "past_key_value", past_key_value) + cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -116,16 +97,15 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - bsz = key_states.shape[0] - # >>> END AH Changes <<< attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + bsz = key_states.shape[0] + if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask @@ -143,7 +123,7 @@ def forward( attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) if self.config.pretraining_tp > 1: attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) @@ -158,7 +138,7 @@ def forward( return attn_output, attn_weights, past_key_value -class LlamaFlashAttention2WithAdapters(LlamaAttentionMixin, LlamaFlashAttention2): +class LlamaFlashAttention2WithAdapters(LlamaAttentionMixin, LlamaAttention): def forward( self, hidden_states: torch.Tensor, @@ -168,15 +148,8 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if isinstance(past_key_value, StaticCache): - raise ValueError( - "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make" - " sure to use `sdpa` in the mean time, and open an issue at" - " https://github.com/huggingface/transformers" - ) - output_attentions = False bsz, q_len, _ = hidden_states.size() @@ -192,38 +165,27 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." 
- ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings + cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # Make adjustments since (parallel) prefix tuning changes the attention mask + bsz = key_states.shape[0] - # >>> END AH Changes <<< + + past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache # to be able to avoid many of these transpose/reshape/view. @@ -259,19 +221,11 @@ def forward( key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) - attn_output = _flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - sliding_window=getattr(self, "sliding_window", None), - use_top_left_mask=self._flash_attn_uses_top_left_mask, - is_causal=self.is_causal, + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate ) - attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() attn_output = self.o_proj(attn_output) if not output_attentions: @@ -280,7 +234,7 @@ def forward( return attn_output, attn_weights, past_key_value -class LlamaSdpaAttentionWithAdapters(LlamaAttentionMixin, LlamaSdpaAttention): +class LlamaSdpaAttentionWithAdapters(LlamaAttentionMixin, LlamaAttention): # Adapted from LlamaAttention.forward def forward( @@ -292,8 +246,6 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 - **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
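Illustrative usage sketch (not part of the patch): the `self.prefix_tuning(...)` and `adjust_tensors_for_parallel(...)` calls kept in the Llama attention classes above only take effect once a matching adapter is added through the public API. A minimal sketch assuming the standard adapters workflow; the checkpoint path is a placeholder.

import adapters
from adapters import PrefixTuningConfig
from transformers import AutoModelForCausalLM

# Any Llama-style causal LM checkpoint works the same way; the path is a placeholder.
model = AutoModelForCausalLM.from_pretrained("path/to/llama-checkpoint")
adapters.init(model)

# Adding a prefix-tuning adapter is what makes the prefix_tuning calls in the
# attention forward passes above return non-empty prefix key/value states.
model.add_adapter("llama-prefix", config=PrefixTuningConfig(prefix_length=30))
model.train_adapter("llama-prefix")  # freezes the base model, trains only the prefix parameters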
@@ -311,7 +263,6 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, ) bsz, q_len, _ = hidden_states.size() @@ -324,25 +275,17 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." - ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings + cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + # In case static cache is used, it is an instance attribute. + past_key_value = getattr(self, "past_key_value", past_key_value) + if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} @@ -351,16 +294,15 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # >>> END AH Changes <<< bsz = key_states.shape[0] causal_mask = attention_mask + # if attention_mask is not None and cache_position is not None: if attention_mask is not None: causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] @@ -371,17 +313,12 @@ def forward( key_states = key_states.contiguous() value_states = value_states.contiguous() - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
- is_causal = True if causal_mask is None and q_len > 1 else False - attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/adapters/models/mbart/adapter_model.py b/src/adapters/models/mbart/adapter_model.py index ebbfb45ef..186aef5c0 100644 --- a/src/adapters/models/mbart/adapter_model.py +++ b/src/adapters/models/mbart/adapter_model.py @@ -68,7 +68,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). @@ -136,7 +136,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs, + **kwargs ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/adapters/models/mbart/modeling_mbart.py b/src/adapters/models/mbart/modeling_mbart.py index 45bdceae2..0f8f0d533 100644 --- a/src/adapters/models/mbart/modeling_mbart.py +++ b/src/adapters/models/mbart/modeling_mbart.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch MBART model.""" +""" PyTorch MBART model.""" from typing import Optional, Tuple import torch diff --git a/src/adapters/models/mistral/adapter_model.py b/src/adapters/models/mistral/adapter_model.py index 1909fccde..3897c377d 100644 --- a/src/adapters/models/mistral/adapter_model.py +++ b/src/adapters/models/mistral/adapter_model.py @@ -1,5 +1,5 @@ import logging - +from typing import Optional import torch from transformers.models.mistral.modeling_mistral import MISTRAL_START_DOCSTRING, MistralModel, MistralPreTrainedModel @@ -16,7 +16,7 @@ @add_start_docstrings( """ -The Mistal Model that allows the loading of different heads for different tasks. This enables a flexible use of the +The Mistral Model that allows the loading of different heads for different tasks. This enables a flexible use of the models and adapters. Since this class does classification on the last token, it requires to know the position of the last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. 
Since @@ -56,13 +56,14 @@ def forward( past_key_values=None, inputs_embeds=None, use_cache=None, + cache_position: Optional[torch.LongTensor] = None, output_attentions=None, output_hidden_states=None, return_dict=None, head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -77,6 +78,7 @@ def forward( position_ids=position_ids, inputs_embeds=inputs_embeds, use_cache=use_cache, + cache_position=cache_position, output_attentions=output_attentions, return_dict=return_dict, output_hidden_states=output_hidden_states, diff --git a/src/adapters/models/mistral/mixin_mistral.py b/src/adapters/models/mistral/mixin_mistral.py index 9acd17995..09c810e7f 100644 --- a/src/adapters/models/mistral/mixin_mistral.py +++ b/src/adapters/models/mistral/mixin_mistral.py @@ -6,6 +6,7 @@ from ...methods.lora import LoRALinear from ...methods.prefix_tuning import PrefixTuningLayer from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin +from ...utils import patch_forward class MistralAttentionMixin: @@ -16,6 +17,8 @@ def init_adapters(self, model_config, adapters_config): self.prefix_tuning = PrefixTuningLayer("self_prefix", model_config, adapters_config) + patch_forward(self) + class MistralDecoderLayerMixin: def init_adapters(self, model_config, adapters_config): @@ -26,6 +29,8 @@ def init_adapters(self, model_config, adapters_config): self.attention_adapters = BottleneckLayer("mh_adapter") self.output_adapters = BottleneckLayer("output_adapter") + patch_forward(self) + class MistralModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin): support_prompt_tuning = False diff --git a/src/adapters/models/mistral/modeling_mistral.py b/src/adapters/models/mistral/modeling_mistral.py index 00e020515..900d831e2 100644 --- a/src/adapters/models/mistral/modeling_mistral.py +++ b/src/adapters/models/mistral/modeling_mistral.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch Mistral model.""" +""" PyTorch Mistral model.""" +import inspect import math from typing import Optional, Tuple @@ -25,17 +26,11 @@ import torch.utils.checkpoint from torch import nn -from adapters.composition import ( - adjust_tensors_for_parallel, - adjust_tensors_for_parallel_, - match_attn_matrices_for_parallel, -) +from adapters.composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel from transformers.cache_utils import Cache, StaticCache from transformers.models.mistral.modeling_mistral import ( MistralAttention, MistralDecoderLayer, - MistralFlashAttention2, - MistralSdpaAttention, apply_rotary_pos_emb, repeat_kv, ) @@ -43,15 +38,18 @@ from .mixin_mistral import MistralAttentionMixin, MistralDecoderLayerMixin - if is_flash_attn_2_available(): - from transformers.models.mistral.modeling_mistral import _flash_supports_window_size + from flash_attn import flash_attn_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) logger = logging.get_logger(__name__) class MistralAttentionWithAdapters(MistralAttentionMixin, MistralAttention): + """Multi-headed attention from 'Attention Is All You Need' paper""" + def forward( self, hidden_states: torch.Tensor, @@ -72,13 +70,12 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< + past_key_value = getattr(self, "past_key_value", past_key_value) cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) @@ -90,17 +87,15 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # Make adjustments since (parallel) prefix tuning changes the attention mask - bsz = key_states.shape[0] - # >>> END AH Changes <<< attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + bsz = key_states.shape[0] + if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask @@ -127,7 +122,13 @@ def forward( return attn_output, attn_weights, past_key_value -class MistralFlashAttention2WithAdapters(MistralAttentionMixin, MistralFlashAttention2): +class MistralFlashAttention2WithAdapters(MistralAttentionMixin, MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + def forward( self, hidden_states: torch.Tensor, @@ -140,11 +141,9 @@ def forward( ): if isinstance(past_key_value, StaticCache): raise ValueError( - "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make" - " sure to use `sdpa` in the mean time, and open an issue at" - " https://github.com/huggingface/transformers" + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" ) - output_attentions = False bsz, q_len, _ = hidden_states.size() @@ -157,12 +156,10 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< kv_seq_len = key_states.shape[-2] if past_key_value is not None: @@ -171,6 +168,13 @@ def forward( cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + key_states, value_states, attention_mask = self.prefix_tuning( + key_states, value_states, hidden_states, attention_mask + ) + (query_states,) = adjust_tensors_for_parallel(key_states, query_states) + + bsz = key_states.shape[0] + use_sliding_windows = ( _flash_supports_window_size and getattr(self.config, "sliding_window", None) is not None @@ -179,10 +183,12 @@ def forward( if not _flash_supports_window_size: logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory" - " efficient implementation make sure to upgrade flash-attn library." + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." ) + past_key_value = getattr(self, "past_key_value", past_key_value) + if past_key_value is not None: # Activate slicing cache only if the config has a value `sliding_windows` attribute cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 @@ -201,8 +207,8 @@ def forward( if past_key.shape[-2] != self.config.sliding_window - 1: raise ValueError( - "past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1," - f" head_dim`), got {past_key.shape}" + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" ) if attention_mask is not None: @@ -217,16 +223,6 @@ def forward( value_states = repeat_kv(value_states, self.num_key_value_groups) dropout_rate = 0.0 if not self.training else self.attention_dropout - # >>> START AH Changes <<< - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # Make adjustments since (parallel) prefix tuning changes the attention mask - kv_seq_len = key_states.shape[-2] - bsz = key_states.shape[0] - # >>> END AH Changes <<< - # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. 
Hence, we need # cast them back in float16 just to be sure everything works as expected. @@ -241,8 +237,8 @@ def forward( target_dtype = self.q_proj.weight.dtype logger.warning_once( - "The input hidden states seems to be silently casted in float32, this might be related to the fact" - " you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" f" {target_dtype}." ) @@ -274,7 +270,15 @@ def forward( return attn_output, attn_weights, past_key_value -class MistralSdpaAttentionWithAdapters(MistralAttentionMixin, MistralSdpaAttention): +# Adapted from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +class MistralSdpaAttentionWithAdapters(MistralAttentionMixin, MistralAttention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward def forward( self, hidden_states: torch.Tensor, @@ -289,10 +293,8 @@ def forward( if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention`" - " does not support `output_attentions=True`. Falling back to the manual attention implementation, but" - " specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This" - ' warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( hidden_states=hidden_states, @@ -314,12 +316,10 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # >>> START AH Changes <<< query_states, key_states, value_states = match_attn_matrices_for_parallel( query_states, key_states, value_states ) (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - # >>> END AH Changes <<< cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) @@ -332,14 +332,10 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - # >>> START AH Changes <<< key_states, value_states, attention_mask = self.prefix_tuning( key_states, value_states, hidden_states, attention_mask ) (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - # Make adjustments since (parallel) prefix tuning changes the attention mask - bsz = key_states.shape[0] - # >>> END AH Changes <<< causal_mask = attention_mask if attention_mask is not None: @@ -379,7 +375,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, @@ -403,9 +399,7 @@ def forward( kwargs (`dict`, *optional*): Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code into the model - """ - - adjust_tensors_for_parallel_(hidden_states, attention_mask, position_ids) + """ # adjust_tensors_for_parallel(hidden_states, attention_mask, position_ids) residual = hidden_states hidden_states = self.input_layernorm(hidden_states) diff --git a/src/adapters/models/mt5/adapter_model.py b/src/adapters/models/mt5/adapter_model.py index 418b47b13..2868aec3e 100644 --- a/src/adapters/models/mt5/adapter_model.py +++ b/src/adapters/models/mt5/adapter_model.py @@ -81,7 +81,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -161,7 +161,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs, + **kwargs ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/adapters/models/mt5/modeling_mt5.py b/src/adapters/models/mt5/modeling_mt5.py index b982d34d6..12ad630a7 100644 --- a/src/adapters/models/mt5/modeling_mt5.py +++ b/src/adapters/models/mt5/modeling_mt5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch MT5 model.""" +""" PyTorch MT5 model.""" import torch from torch import nn diff --git a/src/adapters/models/plbart/__init__.py b/src/adapters/models/plbart/__init__.py deleted file mode 100644 index 1160ba151..000000000 --- a/src/adapters/models/plbart/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' 
imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - -# Copyright 2020 The Adapter-Hub Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import TYPE_CHECKING - -from transformers.utils import _LazyModule - - -_import_structure = { - "adapter_model": ["PLBartAdapterModel"], -} - - -if TYPE_CHECKING: - from .adapter_model import PLBartAdapterModel - -else: - import sys - - sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - ) diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py deleted file mode 100644 index 2aaaf0b9f..000000000 --- a/src/adapters/models/plbart/adapter_model.py +++ /dev/null @@ -1,162 +0,0 @@ -import torch - -from transformers.models.plbart.modeling_plbart import ( - PLBART_INPUTS_DOCSTRING, - PLBART_START_DOCSTRING, - PLBartConfig, - PLBartModel, - PLBartPreTrainedModel, - shift_tokens_right, -) -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward - -from ...heads import ModelWithFlexibleHeadsAdaptersMixin -from ...model_mixin import EmbeddingAdaptersWrapperMixin -from ...wrappers import init - - -@add_start_docstrings( - "PLBART Model with the option to add multiple flexible prediction heads on top.", PLBART_START_DOCSTRING -) -class PLBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, PLBartPreTrainedModel): - _tied_weights_keys = [ - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] - - head_types = [ - "classification", - "multilabel_classification", - "question_answering", - "seq2seq_lm", - ] - - def __init__(self, config: PLBartConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = PLBartModel(config) - init(self.model) - - self._init_head_modules() - - self.post_init() - - def get_encoder(self): - return self.model.get_encoder() - - def get_decoder(self): - return self.model.get_decoder() - - @add_start_docstrings_to_model_forward(PLBART_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - past_key_values=None, - head=None, - output_adapter_gating_scores=False, - output_adapter_fusion_attentions=False, - **kwargs, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: - use_cache = False - - outputs, context = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - past_key_values=past_key_values, - output_adapter_gating_scores=output_adapter_gating_scores, - output_adapter_fusion_attentions=output_adapter_fusion_attentions, - adapter_input_parallelized=kwargs.pop("adapter_input_parallelized", False), - output_context=True, - ) - # required e.g. for prompt tuning in all models - kwargs["context"] = context - - head_outputs = self.forward_head( - outputs, - head_name=head, - attention_mask=attention_mask, - return_dict=return_dict, - get_cls_from_eos_tokens=True, - # `get_cls_from_eos_tokens` requires passing eos mask - eos_mask=input_ids.eq(self.config.eos_token_id) if input_ids is not None else None, - **kwargs, - ) - - return head_outputs - - # Copied from PLBartForConditionalGeneration - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - "adapter_input_parallelized": kwargs.pop("adapter_input_parallelized", False), - } - - # Copied from PLBartForConditionalGeneration - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return shift_tokens_right(labels, self.config.pad_token_id) # , self.config.decoder_start_token_id) - - # Copied from PLBartForConditionalGeneration - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - # cached cross_attention states don't have to be reordered -> they are always the same - reordered_past += ( - tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], - ) - return reordered_past diff --git a/src/adapters/models/plbart/mixin_plbart.py b/src/adapters/models/plbart/mixin_plbart.py deleted file mode 100644 index bd02e04de..000000000 --- a/src/adapters/models/plbart/mixin_plbart.py +++ /dev/null @@ -1,109 +0,0 @@ -from typing import Iterable, Optional, Tuple - -import torch -import torch.nn as nn - -from ...composition import adjust_tensors_for_parallel -from ...methods.bottleneck import BottleneckLayer -from ...methods.lora import LoRALinear -from ...methods.prefix_tuning import PrefixTuningLayer -from ...model_mixin import ( - EmbeddingAdaptersMixin, - EmbeddingAdaptersWrapperMixin, - InvertibleAdaptersMixin, - InvertibleAdaptersWrapperMixin, - ModelBaseAdaptersMixin, -) - - -class PLBartAttentionAdaptersMixin: - """Adds adapters to the BartAttention module.""" - - def init_adapters(self, model_config, adapters_config): - # Wrap layers for LoRA - self.k_proj = LoRALinear.wrap(self.k_proj, "selfattn", model_config, adapters_config, attn_key="k") - self.v_proj = LoRALinear.wrap(self.v_proj, "selfattn", model_config, adapters_config, attn_key="v") - self.q_proj = LoRALinear.wrap(self.q_proj, "selfattn", model_config, adapters_config, attn_key="q") - - self.prefix_tuning = PrefixTuningLayer( - self.location_key + "_prefix" if self.location_key else None, model_config, adapters_config - ) - - -class PLBartEncoderLayerAdaptersMixin: - """Adds adapters to the PLBartEncoderLayer module of PLBART.""" - - def init_adapters(self, model_config, adapters_config): - self.adapters_config = adapters_config - # Wrap layers for LoRA - self.fc1 = LoRALinear.wrap(self.fc1, "intermediate", model_config, adapters_config) - self.fc2 = LoRALinear.wrap(self.fc2, "output", model_config, adapters_config) - - # Set attention layer location key for prefix tuning - self.self_attn.location_key = "encoder" - self.attention_adapters = BottleneckLayer("mh_adapter") - self.output_adapters = BottleneckLayer("output_adapter") - - -class PLBartDecoderLayerAdaptersMixin(PLBartEncoderLayerAdaptersMixin): - """Adds adapters to the PLBartDecoderLayer module of PLBART.""" - - def init_adapters(self, model_config, adapters_config): - super().init_adapters(model_config, adapters_config) - # Set attention layer location key for prefix tuning - self.self_attn.location_key = "self" - self.encoder_attn.location_key = "cross" - self.cross_attention_adapters = BottleneckLayer("cross_adapter") - - -class PLBartEncoderAdaptersMixin(InvertibleAdaptersMixin): - """Adds adapters to 
the PLBartEncoder module of PLBART.""" - - pass - - -class PLBartDecoderAdaptersMixin: - """Adds adapters to the PLBartDecoder module of PLBART.""" - - def forward( - self, input_ids: torch.LongTensor = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, **kwargs - ): - (input_ids,) = adjust_tensors_for_parallel(encoder_hidden_states, input_ids) - return super().forward(input_ids=input_ids, encoder_hidden_states=encoder_hidden_states, **kwargs) - - -class PLBartModelAdaptersMixin(EmbeddingAdaptersMixin, InvertibleAdaptersWrapperMixin, ModelBaseAdaptersMixin): - """Adds adapters to the PLBartModel class.""" - - invertible_adapters_base_name = "encoder" - support_prompt_tuning = False - - def init_adapters(self, model_config, adapters_config): - super().init_adapters(model_config, adapters_config) - self.encoder.layernorm_embedding.register_forward_hook(self.post_embedding_forward) - - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - if hasattr(self, "encoder"): - for i, layer in enumerate(self.encoder.layers): - yield i, layer - for i, layer in enumerate(self.decoder.layers, start=len(self.encoder.layers)): - yield i, layer - else: - for i, layer in enumerate(self.decoder.layers): - yield i, layer - - def post_embedding_forward(self, module, args, embedding_output): - embedding_output = self.invertible_adapters_forward(embedding_output) - # Prompt tuning not yet supported - return embedding_output - - -class PLBartDecoderWrapperAdaptersMixin(EmbeddingAdaptersWrapperMixin, ModelBaseAdaptersMixin): - """Adds adapters to the PLBartDecoderWrapper class.""" - - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - for i, layer in enumerate(self.decoder.layers): - yield i, layer - - def get_input_embeddings(self): - return self.decoder.get_input_embeddings() diff --git a/src/adapters/models/plbart/modeling_plbart.py b/src/adapters/models/plbart/modeling_plbart.py deleted file mode 100644 index 2d812cae1..000000000 --- a/src/adapters/models/plbart/modeling_plbart.py +++ /dev/null @@ -1,537 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""PyTorch PLBART model.""" -from typing import Optional, Tuple - -import torch -import torch.utils.checkpoint -from torch import nn - -from transformers.models.plbart.modeling_plbart import PLBartAttention, PLBartDecoderLayer, PLBartEncoderLayer -from transformers.utils import logging - -from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_, match_attn_matrices_for_parallel -from .mixin_plbart import ( - PLBartAttentionAdaptersMixin, - PLBartDecoderLayerAdaptersMixin, - PLBartEncoderLayerAdaptersMixin, -) - - -logger = logging.get_logger(__name__) - - -class PLBartAttentionWithAdapters(PLBartAttentionAdaptersMixin, PLBartAttention): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - query_states, key_states, value_states = match_attn_matrices_for_parallel( - query_states, key_states, value_states - ) - (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - bsz = query_states.size(0) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.reshape(*proj_shape) - value_states = value_states.reshape(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if layer_head_mask is not None: - if layer_head_mask.size() != (self.num_heads,): - raise ValueError( - f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" - f" {layer_head_mask.size()}" - ) - attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned across GPUs when using tensor-parallelism. 
- attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -class PLBartFlashAttention2WithAdapters(PLBartAttentionAdaptersMixin, PLBartAttention): - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # PLBartFlashAttention2 attention does not support output_attentions - if output_attentions: - raise ValueError("PLBartFlashAttention2 attention does not support output_attentions") - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, q_len, _ = hidden_states.size() - - # get query proj - query_states = self._reshape(self.q_proj(hidden_states), -1, bsz) - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0].transpose(1, 2) - value_states = past_key_value[1].transpose(1, 2) - elif is_cross_attention: - # cross_attentions - key_states = self._reshape(self.k_proj(key_value_states), -1, bsz) - value_states = self._reshape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) - value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1) - value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1) - else: - # self_attention - key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) - value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) - - query_states, key_states, value_states = match_attn_matrices_for_parallel( - query_states, key_states, value_states - ) - (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2)) - - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - bsz = query_states.size(0) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. (LlamaRMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - # Handle the case where the model is quantized - if hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - "The input hidden states seems to be silently casted in float32, this might be related to the fact" - " you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout - ) - - attn_output = attn_output.reshape(bsz, q_len, -1) - attn_output = self.out_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class PLBartSdpaAttentionWithAdapters(PLBartAttentionAdaptersMixin, PLBartAttention): - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - if output_attentions or layer_head_mask is not None: - # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "PLBartModel is using PLBartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does" - " not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual" - " attention implementation, but specifying the manual implementation will be required from" - " Transformers version v5.0.0 onwards. This warning can be removed using the argument" - ' `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states, - key_value_states=key_value_states, - past_key_value=past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - query_states, key_states, value_states = match_attn_matrices_for_parallel( - query_states, key_states, value_states - ) - (attention_mask,) = adjust_tensors_for_parallel(query_states, attention_mask) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - key_states, value_states, attention_mask = self.prefix_tuning( - key_states, value_states, hidden_states, attention_mask - ) - (query_states,) = adjust_tensors_for_parallel(key_states, query_states) - bsz = query_states.size(0) - - query_states = self._shape(query_states, tgt_len, bsz) - - # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, - # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. 
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1, - ) - - if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned across GPUs when using tensor-parallelism. - attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, None, past_key_value - - -class PLBartEncoderLayerWithAdapters(PLBartEncoderLayerAdaptersMixin, PLBartEncoderLayer): - def forward( - self, - hidden_states: torch.FloatTensor, - attention_mask: torch.FloatTensor, - layer_head_mask: torch.FloatTensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size - `(encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - adjust_tensors_for_parallel_(hidden_states, attention_mask) - - residual = hidden_states - hidden_states, attn_weights, _ = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.attention_adapters(hidden_states, residual, self.self_attn_layer_norm) - - residual = hidden_states - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.output_adapters(hidden_states, residual, self.final_layer_norm) - - if hidden_states.dtype == torch.float16 and ( - torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() - ): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -class PLBartDecoderLayerWithAdapters(PLBartDecoderLayerAdaptersMixin, PLBartDecoderLayer): - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - cross_attn_layer_head_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = True, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states 
(`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(batch, seq_len, embed_dim)` - encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size - `(encoder_attention_heads,)`. - cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of - size `(decoder_attention_heads,)`. - past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - adjust_tensors_for_parallel_(hidden_states, attention_mask, encoder_attention_mask) - - residual = hidden_states - - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.attention_adapters(hidden_states, residual, self.self_attn_layer_norm) - - # Cross-Attention Block - cross_attn_present_key_value = None - cross_attn_weights = None - if encoder_hidden_states is not None: - residual = hidden_states - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( - hidden_states=hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.cross_attention_adapters(hidden_states, residual, self.encoder_attn_layer_norm) - - # add cross-attn to positions 3,4 of present_key_value tuple - present_key_value = present_key_value + cross_attn_present_key_value - - # Fully Connected - residual = hidden_states - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = self.output_adapters(hidden_states, residual, self.final_layer_norm) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) - - if use_cache: - outputs += (present_key_value,) - - return outputs diff 
--git a/src/adapters/models/roberta/adapter_model.py b/src/adapters/models/roberta/adapter_model.py index ab9411ef7..87858566b 100644 --- a/src/adapters/models/roberta/adapter_model.py +++ b/src/adapters/models/roberta/adapter_model.py @@ -53,7 +53,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None diff --git a/src/adapters/models/t5/adapter_model.py b/src/adapters/models/t5/adapter_model.py index 5aa7aff4f..b544252ce 100644 --- a/src/adapters/models/t5/adapter_model.py +++ b/src/adapters/models/t5/adapter_model.py @@ -74,7 +74,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -154,7 +154,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs, + **kwargs ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/adapters/models/t5/modeling_t5.py b/src/adapters/models/t5/modeling_t5.py index c98cfa477..03d9f2797 100644 --- a/src/adapters/models/t5/modeling_t5.py +++ b/src/adapters/models/t5/modeling_t5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch T5 model.""" +""" PyTorch T5 model.""" import torch from torch import nn diff --git a/src/adapters/models/vit/modeling_vit.py b/src/adapters/models/vit/modeling_vit.py index 323fb6cab..f8c02bd93 100644 --- a/src/adapters/models/vit/modeling_vit.py +++ b/src/adapters/models/vit/modeling_vit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""PyTorch ViT model.""" +""" PyTorch ViT model.""" import math @@ -23,7 +23,7 @@ from torch import nn from adapters.composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel -from transformers.models.vit.modeling_vit import ViTLayer, ViTOutput, ViTSdpaSelfAttention, ViTSelfAttention +from transformers.models.vit.modeling_vit import ViTLayer, ViTOutput, ViTSelfAttention from .mixin_vit import ViTLayerAdaptersMixin, ViTOutputAdaptersMixin, ViTSelfAttentionAdaptersMixin @@ -70,38 +70,6 @@ def forward( return outputs -class ViTSdpaSelfAttentionWithAdapters(ViTSelfAttentionAdaptersMixin, ViTSdpaSelfAttention): - def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False - ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - mixed_query_layer = self.query(hidden_states) - - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - query_layer = self.transpose_for_scores(mixed_query_layer) - - query_layer, key_layer, value_layer = match_attn_matrices_for_parallel(query_layer, key_layer, value_layer) - - key_layer, value_layer, _ = self.prefix_tuning(key_layer, value_layer, hidden_states) - (query_layer,) = adjust_tensors_for_parallel(key_layer, query_layer) - - context_layer = torch.nn.functional.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - head_mask, - self.attention_probs_dropout_prob if self.training else 0.0, - is_causal=False, - scale=None, - ) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) - - return context_layer, None - - class ViTOutputWithAdapters(ViTOutputAdaptersMixin, ViTOutput): def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(hidden_states) diff --git a/src/adapters/models/xlm_roberta/adapter_model.py b/src/adapters/models/xlm_roberta/adapter_model.py index 1cab4aaac..8acfde792 100644 --- a/src/adapters/models/xlm_roberta/adapter_model.py +++ b/src/adapters/models/xlm_roberta/adapter_model.py @@ -56,7 +56,7 @@ def forward( head=None, output_adapter_gating_scores=False, output_adapter_fusion_attentions=False, - **kwargs, + **kwargs ): input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None diff --git a/src/adapters/models/xmod/adapter_model.py b/src/adapters/models/xmod/adapter_model.py index a179fc6be..94cc43f71 100644 --- a/src/adapters/models/xmod/adapter_model.py +++ b/src/adapters/models/xmod/adapter_model.py @@ -59,7 +59,7 @@ def forward( head: Optional[str] = None, output_adapter_gating_scores: Optional[bool] = False, output_adapter_fusion_attentions: Optional[bool] = False, - **kwargs, + **kwargs ): # Flatten for multiple choice tasks input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None diff --git a/src/adapters/trainer.py b/src/adapters/trainer.py deleted file mode 100644 index 6be5b3ee7..000000000 --- a/src/adapters/trainer.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -import re -from typing import Callable, Dict, List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.utils.data.dataset import Dataset - -from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, 
__version__ -from transformers.configuration_utils import PretrainedConfig -from transformers.data.data_collator import DataCollator -from transformers.modeling_utils import unwrap_model -from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState -from transformers.trainer_utils import EvalPrediction -from transformers.training_args import TrainingArguments -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, is_sagemaker_mp_enabled, logging - -from .composition import AdapterCompositionBlock, Fuse - - -if is_sagemaker_mp_enabled(): - import smdistributed.modelparallel.torch as smp - - -logger = logging.get_logger(__name__) - - -class AdapterTrainer(Trainer): - def __init__( - self, - model: Union[PreTrainedModel, nn.Module] = None, - args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Dataset] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Callable[[], PreTrainedModel] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - adapter_names: Optional[List[List[str]]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None, - ): - super().__init__( - model, - args, - data_collator, - train_dataset, - eval_dataset, - tokenizer=tokenizer, - model_init=model_init, - compute_metrics=compute_metrics, - callbacks=[AdapterTrainerCallback(self)] + callbacks if callbacks else [AdapterTrainerCallback(self)], - optimizers=optimizers, - preprocess_logits_for_metrics=preprocess_logits_for_metrics, - ) - - if adapter_names is not None: - self.model.set_active_adapters(adapter_names) - # Set the defaults for loading/ saving model & adapters - if isinstance(self.model, PreTrainedModel): - model_frozen = getattr(self.model.base_model, "model_frozen", False) - else: - model_frozen = False - if model_frozen and self.model.active_adapters: - # Check if training AdapterFusion - self.train_adapter_fusion = ( - isinstance(self.model.active_adapters, Fuse) - or isinstance(self.model.active_adapters, AdapterCompositionBlock) - and any([isinstance(child, Fuse) for child in self.model.active_adapters.children]) - ) - if self.model.active_adapters is None: - raise ValueError( - "Expected a model with an active adapter setup." - "If you want to fully finetune the model use the Trainer class." - ) - if (self.label_names is None or len(self.label_names) < 1) and self.model.active_head is not None: - all_label_names = set() - for head in self.model._active_heads: - all_label_names |= set(self.model.heads[head].get_label_names()) - self.label_names = list(all_label_names) - - def create_optimizer(self): - """ - Setup the optimizer. - - We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the - Trainer's init through `optimizers`, or subclass and override this method in a subclass. 
- """ - opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model - - if self.optimizer is None: - decay_parameters = self.get_decay_parameter_names(opt_model) - if hasattr(self.model, "config") and hasattr(self.model.config, "adapters"): - match_str = r"adapter_fusion_layer\..*\.value" - decay_parameters = [name for name in decay_parameters if not re.match(match_str, name)] - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad) - ], - "weight_decay": self.args.weight_decay, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - }, - ] - - optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) - self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) - - if is_sagemaker_mp_enabled(): - self.optimizer = smp.DistributedOptimizer(self.optimizer) - - return self.optimizer - - def _save(self, output_dir: Optional[str] = None, state_dict=None): - # If we are executing this function, we are the process zero, so we don't check for that. - output_dir = output_dir if output_dir is not None else self.args.output_dir - os.makedirs(output_dir, exist_ok=True) - logger.info(f"Saving model checkpoint to {output_dir}") - # Save a trained model and configuration using `save_pretrained()`. - # They can then be reloaded using `from_pretrained()` - if not isinstance(self.model, PreTrainedModel): - if isinstance(unwrap_model(self.model), PreTrainedModel): - if state_dict is None: - state_dict = self.model.state_dict() - unwrap_model(self.model).save_pretrained(output_dir, state_dict=state_dict) - else: - logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") - if state_dict is None: - state_dict = self.model.state_dict() - torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) - else: - self.model.save_all_adapters(output_dir) - if self.train_adapter_fusion: - self.model.save_all_adapter_fusions(output_dir) - if hasattr(self.model, "heads"): - self.model.save_all_heads(output_dir) - if self.tokenizer is not None: - self.tokenizer.save_pretrained(output_dir) - - # Good practice: save your training arguments together with the trained model - torch.save(self.args, os.path.join(output_dir, "training_args.bin")) - - def _load_from_checkpoint(self, resume_from_checkpoint): - args = self.args - if os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): - logger.info(f"Loading model from {resume_from_checkpoint}).") - - if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): - config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) - checkpoint_version = config.transformers_version - if checkpoint_version is not None and checkpoint_version != __version__: - logger.warn( - f"You are resuming training from a checkpoint trained with {checkpoint_version} of " - f"Transformers but your current version is {__version__}. This is not recommended and could " - "yield to errors or unwanted behaviors." 
- ) - - if args.deepspeed: - # will be resumed in deepspeed_init - pass - else: - adapter_loaded = False - if os.path.isdir(resume_from_checkpoint): - adapter_loaded = self._load_adapters(resume_from_checkpoint) - self._load_adapter_fusions(resume_from_checkpoint) - # Save all heads for a model with heads - if hasattr(self.model, "heads"): - self._load_heads(resume_from_checkpoint) - - if not adapter_loaded: - raise Exception("Can't find a valid checkpoint at {}".format(resume_from_checkpoint)) - - def _load_adapters(self, resume_from_checkpoint): - adapter_loaded = False - for file_name in os.listdir(resume_from_checkpoint): - if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): - if "," not in file_name and "adapter_config.json" in os.listdir( - os.path.join(resume_from_checkpoint, file_name) - ): - self.model.load_adapter(os.path.join(os.path.join(resume_from_checkpoint, file_name))) - adapter_loaded = True - return adapter_loaded - - def _load_adapter_fusions(self, resume_from_checkpoint): - for file_name in os.listdir(resume_from_checkpoint): - if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): - if "," in file_name: - self.model.load_adapter_fusion(os.path.join(resume_from_checkpoint, file_name)) - - def _load_heads(self, resume_from_checkpoint): - for file_name in os.listdir(resume_from_checkpoint): - if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): - if "," not in file_name and "head_config.json" in os.listdir( - os.path.join(resume_from_checkpoint, file_name) - ): - self.model.load_head(os.path.join(resume_from_checkpoint, file_name)) - - def _load_best_model(self): - model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model - logger.info( - f"Loading best adapter(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." - ) - # attempt to re-load all adapters from checkpoint - for adapter in model.adapters_config.adapters: - adapter_dir = os.path.join(self.state.best_model_checkpoint, adapter) - if os.path.exists(adapter_dir): - model.load_adapter(adapter_dir) - model.adapter_to(adapter, device=self.args.device) - if self.train_adapter_fusion: - logger.info( - f"Loading best adapter fusion(s) from {self.state.best_model_checkpoint} (score:" - f" {self.state.best_metric})." - ) - # attempt to re-load all adapter fusions from checkpoint - for fusion in model.adapters_config.fusions: - fusion_dir = os.path.join(self.state.best_model_checkpoint, fusion) - if os.path.exists(fusion_dir): - model.load_adapter_fusion(fusion_dir) - model.adapter_fusion_to(fusion, device=self.args.device) - - -class AdapterTrainerCallback(TrainerCallback): - def __init__(self, trainer): - super().__init__() - self.trainer = trainer - - def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): - model = kwargs.pop("model") - model_frozen = getattr(model.base_model, "model_frozen", False) - if not model_frozen: - raise ValueError( - "The pre-trained model weights are not frozen. 
For training adapters, please call the train_adapter()" - " method" - ) - - def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): - # apply adapter fusion weight regularization on the value matrix - model = kwargs.pop("model") - if self.trainer.train_adapter_fusion: - fusion_reg_loss = model.base_model.get_fusion_regularization_loss() - if fusion_reg_loss is not None: - fusion_reg_loss.backward() - - -class Seq2SeqAdapterTrainer(AdapterTrainer, Seq2SeqTrainer): - pass diff --git a/src/adapters/training.py b/src/adapters/training 2.py similarity index 98% rename from src/adapters/training.py rename to src/adapters/training 2.py index 5d053affb..831601139 100644 --- a/src/adapters/training.py +++ b/src/adapters/training 2.py @@ -83,7 +83,7 @@ def setup_adapter_training( else: lang_adapter_name = None # Freeze all model weights except of those of this adapter - model.train_adapter(adapter_name) + model.train_adapter([adapter_name]) # Set the adapters to be used in every forward pass if lang_adapter_name: model.set_active_adapters(Stack(lang_adapter_name, adapter_name)) diff --git a/src/adapters/utils.py b/src/adapters/utils 2.py similarity index 77% rename from src/adapters/utils.py rename to src/adapters/utils 2.py index b85537f63..7338f4c3a 100644 --- a/src/adapters/utils.py +++ b/src/adapters/utils 2.py @@ -18,6 +18,7 @@ from os.path import basename, isdir, isfile, join from pathlib import Path from typing import Callable, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse from zipfile import ZipFile, is_zipfile import torch @@ -110,11 +111,11 @@ def __repr__(self): @dataclass class AdapterInfo: """ - Holds information about an adapter publicly available on the Hub. Returned by + Holds information about an adapter publicly available on AdapterHub or huggingface.co. Returned by :func:`list_adapters()`. Args: - source (str): The source repository of this adapter. Always 'hf' for adapters available on HF Model Hub. + source (str): The source repository of this adapter. Can be either "ah" (AdapterHub) or "hf" (huggingface.co). adapter_id (str): The unique identifier of this adapter. model_name (str, optional): The identifier of the model this adapter was trained for. task (str, optional): The task this adapter was trained for. @@ -140,16 +141,14 @@ def _minimize_dict(d): return d -def get_adapter_config_hash(config, length=16, ignore_params=[]): +def get_adapter_config_hash(config, length=16): """ Calculates the hash of a given adapter configuration which is used to identify this configuration. Returns: str: The resulting hash of the given config dict. """ - minimized_config = _minimize_dict( - {k: v for (k, v) in config.items() if k not in ADAPTER_CONFIG_HASH_IGNORE + ignore_params} - ) + minimized_config = _minimize_dict({k: v for (k, v) in config.items() if k not in ADAPTER_CONFIG_HASH_IGNORE}) # ensure hash is kept consistent to previous versions for name, default in ADAPTER_CONFIG_HASH_IGNORE_DEFAULT.items(): if minimized_config.get(name, None) == default: @@ -435,7 +434,7 @@ def parse_adapter_config_string(config_string: str) -> List[Tuple[str, dict]]: return adapter_configs -def resolve_adapter_config(config: Union[dict, str], local_map=None, **kwargs) -> dict: +def resolve_adapter_config(config: Union[dict, str], local_map=None, try_loading_from_hub=True, **kwargs) -> dict: """ Resolves a given adapter configuration specifier to a full configuration dictionary. 
@@ -445,6 +444,7 @@ def resolve_adapter_config(config: Union[dict, str], local_map=None, **kwargs) - - a dictionary: returned without further action - an identifier string available in local_map - the path to a file containing a full adapter configuration + - an identifier string available in Adapter-Hub Returns: dict: The resolved adapter configuration dictionary. @@ -464,6 +464,13 @@ def resolve_adapter_config(config: Union[dict, str], local_map=None, **kwargs) - return loaded_config["config"] else: return loaded_config + # download hub index file + if try_loading_from_hub: + index_file = download_cached(ADAPTER_HUB_CONFIG_FILE, **kwargs) + if not index_file: + raise EnvironmentError("Unable to load adapter hub index file. The file might be temporarily unavailable.") + with open(index_file, "r") as f: + config_index = json.load(f) # parse the config string config_pairs = parse_adapter_config_string(config) if len(config_pairs) > 0: @@ -473,6 +480,11 @@ def resolve_adapter_config(config: Union[dict, str], local_map=None, **kwargs) - if local_map and name in local_map: config_obj = local_map[name] full_configs.append(config_obj.replace(**config_kwargs)) + # now, try to find in hub index + elif try_loading_from_hub and name in config_index: + config_obj = config_index[name] + config_obj.update(**config_kwargs) + full_configs.append(config_obj) else: raise ValueError("Could not identify '{}' as a valid adapter configuration.".format(name)) # Case 1: only one config, return it directly @@ -576,16 +588,34 @@ def _get_matching_version(config_entry, org): raise ValueError("Multiple adapters with this name are available for this config.") +def http_get_json(url): + # check if it's a relative url + if not urlparse(url).netloc: + url = urljoin(ADAPTER_HUB_URL, url) + response = requests.get(url) + if response.status_code == 200: + return response.json() + else: + raise EnvironmentError("Failed to get file {}".format(url)) + + +def get_checksum(file_entry: dict): + for algo in hashlib.algorithms_guaranteed: + if algo in file_entry: + return algo, file_entry[algo] + + def pull_from_hub( specifier: str, model_name: str, adapter_config: Optional[Union[dict, str]] = None, version: str = None, strict: bool = False, - **kwargs, + redirect_to_hf_hub: bool = False, + **kwargs ) -> str: """ - Redirects loading from the archived Hub repository to HuggingFace Model Hub. + Downloads a pre-trained adapter module from Adapter-Hub Args: specifier (str): A string specifying the adapter to be loaded. @@ -594,6 +624,9 @@ def pull_from_hub( version (str, optional): The version of the adapter to be loaded. Defaults to None. strict (bool, optional): If set to True, only allow adapters exactly matching the given config to be loaded. Defaults to False. + redirect_to_hf_hub (bool, optional): + If set to True, the function will redirect to the HuggingFace Model Hub instead of AdapterHub. + Defaults to False. Returns: str: The local path to which the adapter has been downloaded. @@ -609,12 +642,35 @@ def pull_from_hub( raise EnvironmentError("No adapter with name '{}' was found in the adapter index.".format(specifier)) hf_hub_specifier = "AdapterHub/" + os.path.basename(hub_entry_url).split(".")[0] - logger.warning( - "Automatic redirect to HF Model Hub repo '{}'. Please switch to the new ID to remove this warning.".format( - hf_hub_specifier + if redirect_to_hf_hub: + logger.warning( + "Automatic redirect to HF Model Hub repo '{}'. 
Please switch to the new ID to remove this warning.".format( + hf_hub_specifier + ) + ) + return pull_from_hf_model_hub(hf_hub_specifier, version=version, **kwargs) + else: + logger.warning( + "Loading adapters from this source is deprecated. This adapter has moved to '{}'. Please switch to the new" + " ID to remove this warning.".format(hf_hub_specifier) ) - ) - return pull_from_hf_model_hub(hf_hub_specifier, version=version, **kwargs) + + hub_entry = http_get_json(hub_entry_url) + # set version + if not version: + version = hub_entry["default_version"] + elif version not in hub_entry["files"]: + logger.warning("Version '{}' of adapter '{}' not found. Falling back to default.".format(version, specifier)) + version = hub_entry["default_version"] + file_entry = hub_entry["files"][version] + + # start downloading + logger.info("Resolved adapter files at {}.".format(file_entry["url"])) + checksum_algo, checksum = get_checksum(file_entry) + download_path = download_cached(file_entry["url"], checksum=checksum, checksum_algo=checksum_algo, **kwargs) + if not download_path: + raise EnvironmentError("Unable to load file from {}. The file might be unavailable.".format(file_entry["url"])) + return download_path def pull_from_hf_model_hub(specifier: str, version: str = None, **kwargs) -> str: @@ -633,7 +689,9 @@ def resolve_adapter_path( model_name: str = None, adapter_config: Union[dict, str] = None, version: str = None, - **kwargs, + source: str = None, + redirect_to_hf_hub: bool = False, + **kwargs ) -> str: """ Resolves the path to a pre-trained adapter module. Note: If attempting to resolve an adapter from the Hub, @@ -648,6 +706,15 @@ def resolve_adapter_path( model_name (str, optional): The identifier of the pre-trained model for which to load an adapter. adapter_config (Union[dict, str], optional): The configuration of the adapter to be loaded. version (str, optional): The version of the adapter to be loaded. Defaults to None. + source (str, optional): Identifier of the source(s) from where to get adapters. Can be either: + + - "ah": search on AdapterHub.ml. Note: this source is deprecated in favor of "hf". + - "hf": search on HuggingFace model hub (huggingface.co). + - None (default): search on all sources + + redirect_to_hf_hub (bool, optional): + If set to True, the function will redirect to the HuggingFace Model Hub instead of AdapterHub. + Defaults to False. Returns: str: The local path from where the adapter module can be loaded. @@ -672,13 +739,24 @@ def resolve_adapter_path( WEIGHTS_NAME, CONFIG_NAME, adapter_name_or_path ) ) - else: + elif source == "ah": + return pull_from_hub( + adapter_name_or_path, + model_name, + adapter_config=adapter_config, + version=version, + redirect_to_hf_hub=redirect_to_hf_hub, + **kwargs, + ) + elif source == "hf": + return pull_from_hf_model_hub(adapter_name_or_path, version=version, **kwargs) + elif source is None: try: - logger.info("Attempting to load adapter from HF Model Hub...") + logger.info("Attempting to load adapter from source 'hf'...") return pull_from_hf_model_hub(adapter_name_or_path, version=version, **kwargs) except (EnvironmentError, ValueError) as ex: logger.info(ex) - logger.info("Attempting to redirect from archived Hub repo...") + logger.info("Attempting to load adapter from source 'ah'...") try: return pull_from_hub( adapter_name_or_path, @@ -691,70 +769,103 @@ def resolve_adapter_path( except Exception as ex: logger.info(ex) raise EnvironmentError( - "Unable to load adapter {} from any source. 
Please check the name of the adapter or the source.".format( - adapter_name_or_path - ) + "Unable to load adapter {} from any source. Please check the name of the adapter or the source." + .format(adapter_name_or_path) ) + else: + raise ValueError("Unable to identify {} as a valid module location.".format(adapter_name_or_path)) -def list_adapters(model_name: str = None) -> List[AdapterInfo]: +def list_adapters(source: str = None, model_name: str = None) -> List[AdapterInfo]: """ Retrieves a list of all publicly available adapters on AdapterHub.ml or on huggingface.co. Args: + source (str, optional): Identifier of the source(s) from where to get adapters. Can be either: + + - "ah": search on AdapterHub.ml. + - "hf": search on HuggingFace model hub (huggingface.co). + - None (default): search on all sources + model_name (str, optional): If specified, only returns adapters trained for the model with this identifier. """ adapters = [] - if "fetch_config" in inspect.signature(HfApi.list_models).parameters: - kwargs = {"full": True, "fetch_config": True} - else: - logger.warning( - "Using old version of huggingface-hub package for fetching. Please upgrade to latest version for" - " accurate results." - ) - kwargs = {"full": True} - all_hf_adapters_data = HfApi().list_models(filter="adapters", **kwargs) - for model_info in all_hf_adapters_data: - adapter_info = AdapterInfo( - source="hf", - adapter_id=model_info.modelId, - model_name=model_info.config.get("adapters", {}).get("model_name") if model_info.config else None, - username=model_info.modelId.split("/")[0], - sha1_checksum=model_info.sha, - ) - adapters.append(adapter_info) + if source == "ah" or source is None: + try: + all_ah_adapters_file = download_cached(ADAPTER_HUB_ALL_FILE) + except requests.exceptions.HTTPError: + raise EnvironmentError( + "Unable to load list of adapters from AdapterHub.ml. The service might be temporarily unavailable." + ) + with open(all_ah_adapters_file, "r") as f: + all_ah_adapters_data = json.load(f) + adapters += [AdapterInfo(**info) for info in all_ah_adapters_data] + if source == "hf" or source is None: + if "fetch_config" in inspect.signature(HfApi.list_models).parameters: + kwargs = {"full": True, "fetch_config": True} + else: + logger.warning( + "Using old version of huggingface-hub package for fetching. Please upgrade to latest version for" + " accurate results." + ) + kwargs = {"full": True} + all_hf_adapters_data = HfApi().list_models(filter="adapters", **kwargs) + for model_info in all_hf_adapters_data: + adapter_info = AdapterInfo( + source="hf", + adapter_id=model_info.modelId, + model_name=model_info.config.get("adapters", {}).get("model_name") if model_info.config else None, + username=model_info.modelId.split("/")[0], + sha1_checksum=model_info.sha, + ) + adapters.append(adapter_info) if model_name is not None: adapters = [adapter for adapter in adapters if adapter.model_name == model_name] return adapters -def get_adapter_info(adapter_id: str) -> Optional[AdapterInfo]: +def get_adapter_info(adapter_id: str, source: str = "ah") -> Optional[AdapterInfo]: """ Retrieves information about a specific adapter. Args: adapter_id (str): The identifier of the adapter to retrieve. + source (str, optional): Identifier of the source(s) from where to get adapters. Can be either: + + - "ah": search on AdapterHub.ml. + - "hf": search on HuggingFace model hub (huggingface.co). Returns: AdapterInfo: The adapter information or None if the adapter was not found. 
""" - try: - model_info = HfApi().model_info(adapter_id) - return AdapterInfo( - source="hf", - adapter_id=model_info.modelId, - model_name=( - model_info.config.get("adapter_transformers", {}).get("model_name") if model_info.config else None - ), - username=model_info.modelId.split("/")[0], - sha1_checksum=model_info.sha, - ) - except requests.exceptions.HTTPError: - return None + if source == "ah": + if adapter_id.startswith("@"): + adapter_id = adapter_id[1:] + try: + data = http_get_json(f"/adapters/{adapter_id}.json") + return AdapterInfo(**data["info"]) + except EnvironmentError: + return None + elif source == "hf": + try: + model_info = HfApi().model_info(adapter_id) + return AdapterInfo( + source="hf", + adapter_id=model_info.modelId, + model_name=model_info.config.get("adapter_transformers", {}).get("model_name") + if model_info.config + else None, + username=model_info.modelId.split("/")[0], + sha1_checksum=model_info.sha, + ) + except requests.exceptions.HTTPError: + return None + else: + raise ValueError("Please specify either 'ah' or 'hf' as source.") -def prefix_attention_mask(attention_mask, dim: Union[int, List[int]] = 3, prefix_value: int = 0): +def prefix_attention_mask(attention_mask, dim: int = 3, prefix_value: int = 0): """ Adds a prefix to an attention mask. The length of the prefix is determined by the `prefix_attention_mask_length` attribute in the ForwardContext. @@ -779,21 +890,18 @@ def prefix_attention_mask(attention_mask, dim: Union[int, List[int]] = 3, prefix and forward_context is not None and getattr(forward_context, "prompt_tokens_length", None) is not None ): - if isinstance(dim, int): - dim = [dim] - for d in dim: - # Create a tensor of ones with the desired shape - ones_shape = list(attention_mask.shape) - ones_shape[d] = forward_context.prompt_tokens_length - - prefix_attention_mask = torch.full( - ones_shape, - prefix_value, - dtype=attention_mask.dtype, - ).to(attention_mask.device) - - # Concatenate the prefix_attention_mask along the specified dimension - attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=d) + # Create a tensor of ones with the desired shape + ones_shape = list(attention_mask.shape) + ones_shape[dim] = forward_context.prompt_tokens_length + + prefix_attention_mask = torch.full( + ones_shape, + prefix_value, + dtype=attention_mask.dtype, + ).to(attention_mask.device) + + # Concatenate the prefix_attention_mask along the specified dimension + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=dim) return attention_mask diff --git a/src/adapters/wrappers/configuration.py b/src/adapters/wrappers/configuration.py index ed224cd60..c49f3b8b7 100644 --- a/src/adapters/wrappers/configuration.py +++ b/src/adapters/wrappers/configuration.py @@ -46,12 +46,6 @@ "hidden_dropout_prob": "dropout", "attention_probs_dropout_prob": "attention_dropout", }, - "plbart": { - "num_attention_heads": "encoder_attention_heads", - "hidden_size": "d_model", - "hidden_dropout_prob": "dropout", - "attention_probs_dropout_prob": "attention_dropout", - }, "roberta": {}, "t5": { "hidden_size": "d_model", diff --git a/src/adapters/wrappers/model.py b/src/adapters/wrappers/model.py index 12ed79e12..23998a81b 100644 --- a/src/adapters/wrappers/model.py +++ b/src/adapters/wrappers/model.py @@ -95,7 +95,7 @@ def load_model( model_name_or_path: Optional[Union[str, os.PathLike]], model_class: Type[PreTrainedModel], *model_args: Any, - **kwargs: Any, + **kwargs: Any ) -> PreTrainedModel: """ Loads a pretrained model with adapters 
from the given path or url. diff --git a/tests/fixtures/samples/cifar10/cifar10.py b/tests/fixtures/samples/cifar10/cifar10.py index 052a203df..cd00f0260 100644 --- a/tests/fixtures/samples/cifar10/cifar10.py +++ b/tests/fixtures/samples/cifar10/cifar10.py @@ -1,7 +1,6 @@ """ CIFAR-10 demo data, adapted from https://huggingface.co/datasets/cifar10. """ - import os import pickle diff --git a/tests/methods/base.py b/tests/methods/base.py index 6ede68f2f..3954aece4 100644 --- a/tests/methods/base.py +++ b/tests/methods/base.py @@ -46,7 +46,7 @@ def run_add_test(self, model, adapter_config, filter_keys): name = "test_adapter_" + adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) model.to(torch_device) # adapter is correctly added to config @@ -67,7 +67,7 @@ def run_leave_out_test(self, model, adapter_config, leave_out): adapter_config = adapter_config.replace(leave_out=leave_out) name = "test_adapter_" + adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # adapter is correctly added to config self.assert_adapter_available(model, name) @@ -81,10 +81,10 @@ def run_leave_out_test(self, model, adapter_config, leave_out): model.delete_adapter(name) - def run_linear_average_test(self, model, adapter_config, filter_keys): + def run_average_test(self, model, adapter_config, filter_keys): model.eval() - weights = [-0.2, 0.9, 0.3] + weights = [0.1, 0.6, 0.3] # add adapters to average name = "test_adapter_" + adapter_config.__class__.__name__ @@ -103,9 +103,7 @@ def run_linear_average_test(self, model, adapter_config, filter_keys): averaged_weights[base_k] += w * v # average adapters - model.average_adapter( - name, [name + f"_{i}" for i in range(len(weights))], weights=weights, combine_strategy="linear" - ) + model.average_adapter(name, [name + f"_{i}" for i in range(len(weights))], weights=weights) # adapter is correctly added to config self.assertTrue(name in model.adapters_config) @@ -121,7 +119,7 @@ def run_delete_test(self, model, adapter_config, filter_keys): name = "test_adapter_" + adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) model.to(torch_device) # adapter is correctly added to config @@ -142,7 +140,7 @@ def run_get_test(self, model, adapter_config, num_expected_modules): model.eval() model.add_adapter("first", config=adapter_config) - model.set_active_adapters("first") + model.set_active_adapters(["first"]) # adapter is correctly added to config name = "first" @@ -167,7 +165,7 @@ def run_forward_test(self, model, adapter_config, dtype=torch.float32): input_data = self.get_input_samples(config=model.config, dtype=dtype) # pass 1: set adapter via property - model.set_active_adapters(name) + model.set_active_adapters([name]) output_1 = model(**input_data) # pass 2: set via context @@ -191,7 +189,7 @@ def run_load_test(self, adapter_config): name = "dummy_adapter" model1.add_adapter(name, config=adapter_config) - model1.set_active_adapters(name) + model1.set_active_adapters([name]) with tempfile.TemporaryDirectory() as temp_dir: model1.save_adapter(temp_dir, name) @@ -245,7 +243,7 @@ def run_full_model_load_test(self, adapter_config): output1 = model1(**input_data) output2 = model2(**input_data) self.assertEqual(len(output1), len(output2)) - self.assertTrue(torch.allclose(output1[0], output2[0], 
atol=1e-4)) + self.assertTrue(torch.equal(output1[0], output2[0])) def trainings_run(self, model, lr=1.0, steps=8): # setup dataset @@ -333,7 +331,7 @@ def run_merge_test(self, adapter_config): input_data = self.get_input_samples(config=model.config) # forward in training mode - model.set_active_adapters("test_lora") + model.set_active_adapters(["test_lora"]) output_1 = model(**input_data) # forward in merged mode diff --git a/tests/methods/test_adapter_common.py b/tests/methods/test_adapter_common.py index 1ea6cd6f3..b1d67757b 100644 --- a/tests/methods/test_adapter_common.py +++ b/tests/methods/test_adapter_common.py @@ -53,13 +53,13 @@ def test_leave_out_adapter(self): with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): self.run_leave_out_test(model, adapter_config, self.leave_out_layers) - def test_linear_average_adapter(self): + def test_average_adapter(self): model = self.get_model() model.eval() for adapter_config, filter_keys in self.adapter_configs_to_test: with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): - self.run_linear_average_test(model, adapter_config, filter_keys) + self.run_average_test(model, adapter_config, filter_keys) def test_delete_adapter(self): model = self.get_model() @@ -79,7 +79,7 @@ def test_add_adapter_with_invertible(self): with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): name = adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # adapter is correctly added to config self.assertTrue(name in model.adapters_config) @@ -128,7 +128,7 @@ def test_delete_adapter_with_invertible(self): with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): name = adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # check if adapter is correctly added to config self.assert_adapter_available(model, name) @@ -178,7 +178,7 @@ def test_add_adapter_multiple_reduction_factors(self): with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): name = adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # adapter is correctly added to config self.assertTrue(name in model.adapters_config) diff --git a/tests/methods/test_compacter.py b/tests/methods/test_compacter.py index 75716ffa6..ffe7e0eae 100644 --- a/tests/methods/test_compacter.py +++ b/tests/methods/test_compacter.py @@ -14,11 +14,9 @@ def test_leave_out_compacter(self): model = self.get_model() self.run_leave_out_test(model, CompacterPlusPlusConfig(phm_dim=2, reduction_factor=8), self.leave_out_layers) - def test_linear_average_compacter(self): + def test_average_compacter(self): model = self.get_model() - self.run_linear_average_test( - model, CompacterPlusPlusConfig(phm_dim=2, reduction_factor=8), ["adapters.{name}."] - ) + self.run_average_test(model, CompacterPlusPlusConfig(phm_dim=2, reduction_factor=8), ["adapters.{name}."]) def test_delete_compacter(self): model = self.get_model() diff --git a/tests/methods/test_ia3.py b/tests/methods/test_ia3.py index 3a30e2448..0dc81d02b 100644 --- a/tests/methods/test_ia3.py +++ b/tests/methods/test_ia3.py @@ -14,9 +14,9 @@ def test_leave_out_ia3(self): model = 
self.get_model() self.run_leave_out_test(model, IA3Config(), self.leave_out_layers) - def test_linear_average_ia3(self): + def test_average_ia3(self): model = self.get_model() - self.run_linear_average_test(model, IA3Config(), ["loras.{name}."]) + self.run_average_test(model, IA3Config(), ["loras.{name}."]) def test_delete_ia3(self): model = self.get_model() diff --git a/tests/methods/test_lora.py b/tests/methods/test_lora.py index 067f78c8b..0ade2bdbb 100644 --- a/tests/methods/test_lora.py +++ b/tests/methods/test_lora.py @@ -1,9 +1,4 @@ -import random - -import torch - from adapters import LoRAConfig -from adapters.methods.lora import LoRALayer from transformers.testing_utils import require_torch from .base import AdapterMethodBaseTestMixin @@ -19,268 +14,9 @@ def test_leave_out_lora(self): model = self.get_model() self.run_leave_out_test(model, LoRAConfig(), self.leave_out_layers) - def test_merging_with_other_adapters(self): - model = self.get_model() - model.add_adapter("lora", config="lora") - - # Add different adapters - model.add_adapter("bottleneck", config="seq_bn") - model.add_adapter("prompt", config="prompt_tuning") - model.add_adapter("prefix", config="prefix_tuning") - model.add_adapter("ia3", config="ia3") - model.add_adapter("unipelt", config="unipelt") - model.add_adapter("mam", config="mam") - model.add_adapter("compacter", config="compacter[phm_dim=2, reduction_factor=8]") - - # Merging adapters with different architectures with LoRA should raise a ValueError - for adapter_architecture in ["bottleneck", "prompt", "prefix", "ia3", "unipelt", "mam", "compacter"]: - with self.subTest(adapter_architecture=adapter_architecture): - with self.assertRaises(ValueError): - model.average_adapter( - adapter_name=f"average_lora_{adapter_architecture}", - adapter_list=[adapter_architecture, "lora"], - weights=[0.5, 0.5], - combine_strategy="linear", - ) - - def test_linear_average_lora(self): - model = self.get_model() - self.run_linear_average_test(model, LoRAConfig(), ["loras.{name}."]) - - def test_linear_average_only_negate_b_lora(self): - # This method tests that the linear average following the Zhang et al. 2023 paper works as expected. - # Paper: https://proceedings.neurips.cc/paper_files/paper/2023/hash/299a08ee712d4752c890938da99a77c6-Abstract-Conference.html - # This method is an adapted version of the `run_linear_average_test` method. - model = self.get_model() - model.eval() - weights = [-1, 1.5, 0.5] - - # add adapters to average - name = "test_adapter_" + LoRAConfig().__class__.__name__ - for i in range(len(weights)): - model.add_adapter( - f"{name}_{i}", - config=LoRAConfig( - dropout=random.random(), - init_weights=["bert", "lora"][i % 2], - ), - ) - - averaged_weights = {} - for i, w in enumerate(weights): - this_filter_keys = [k.format(name=f"{name}_{i}") for k in ["loras.{name}."]] - for k, v in self.filter_parameters(model, this_filter_keys).items(): - base_k = k.replace(f"{name}_{i}", name) - # Only negate the lora_B weights and use the absolute value of the weight for lora_A weights. 
- weight = abs(w) if "lora_A" in k else w - if base_k not in averaged_weights: - averaged_weights[base_k] = weight * v - else: - averaged_weights[base_k] += weight * v - - # average adapters - model.average_adapter( - name, - [f"{name}_{i}" for i in range(len(weights))], - weights=weights, - combine_strategy="lora_linear_only_negate_b", - ) - - # adapter is correctly added to config - self.assertTrue(name in model.adapters_config) - config = model.adapters_config.get(name) - self.assertEqual(LoRAConfig(dropout=config.dropout, init_weights=config.init_weights), config) - - # compare averaged weights to collected weights - this_filter_keys = [k.format(name=name) for k in ["loras.{name}."]] - for k, v in self.filter_parameters(model, this_filter_keys).items(): - self.assertTrue(torch.allclose(v, averaged_weights[k]), k) - - def _check_svd_weights(self, delta_w, merged_lora, svd_rank, atol=1e-5): - # Compute SVD of the original delta_w - u, s, v = torch.svd(delta_w) - u = u[:, :svd_rank] - s = s[:svd_rank] - v = v[:, :svd_rank] - - # Reconstruct A and B matrices - expected_A = v.t() - expected_B = u @ torch.diag(s) - - # Compare with merged adapter - self.assertTrue(torch.allclose(expected_A, merged_lora.lora_A, atol=atol)) - self.assertTrue(torch.allclose(expected_B, merged_lora.lora_B, atol=atol)) - - def test_linear_delta_w_svd_average_lora(self): - model = self.get_model() - model.eval() - model_supports_lora_delta_w_svd = model.base_model.support_lora_delta_w_svd - weights = [-1, 1.5, 0.5] - - # add adapters to average - name = "test_adapter_" + LoRAConfig().__class__.__name__ - for i in range(len(weights)): - model.add_adapter( - f"{name}_{i}", - config=LoRAConfig( - dropout=random.random(), - init_weights=["bert", "lora"][i % 2], - ), - ) - - if not model_supports_lora_delta_w_svd: - # Some models (GPT2, Deberta) don't support this merging method - with self.assertRaises(ValueError): - model.average_adapter( - "averaged_adapter", - [f"{name}_{i}" for i in range(len(weights))], - weights=weights, - combine_strategy="lora_delta_w_svd", - ) - - return - - # average adapters - svd_rank = 16 - model.average_adapter( - "averaged_adapter", - [f"{name}_{i}" for i in range(len(weights))], - weights=weights, - combine_strategy="lora_delta_w_svd", - svd_rank=svd_rank, - ) - - # adapter is correctly added to config - self.assertTrue("averaged_adapter" in model.adapters_config) - config = model.adapters_config.get("averaged_adapter") - self.assertEqual(LoRAConfig(dropout=config.dropout, init_weights=config.init_weights, r=svd_rank), config) - - # Calculate the new weights: Matrix A and B are SVD of all the weighted delta_w matrices of the adapters. 
- for i, layer in model.iter_layers(): - for module in layer.modules(): - if isinstance(module, LoRALayer): - # Check if this layer has the LoRA adapters - if not ( - f"{name}_0" in module.loras - and f"{name}_1" in module.loras - and f"{name}_2" in module.loras - and name in module.loras - ): - continue - - # Calculate the new weights - delta_w_1 = module.loras[name + "_0"].delta_w - delta_w_2 = module.loras[name + "_1"].delta_w - delta_w_3 = module.loras[name + "_2"].delta_w - delta_w = weights[0] * delta_w_1 + weights[1] * delta_w_2 + weights[2] * delta_w_3 - - self._check_svd_weights(delta_w, module.loras["averaged_adapter"], svd_rank) - - def test_edge_case_average_adapters_single_adapter(self): - # If we merge only one adapter, the weights of the new adapter should be the same as the original adapter + def test_average_lora(self): model = self.get_model() - model.eval() - model_supports_lora_delta_w_svd = model.base_model.support_lora_delta_w_svd - - # add adapters to average - name = "test_adapter_" + LoRAConfig().__class__.__name__ - for i in range(3): - model.add_adapter( - f"{name}_{i}", - config=LoRAConfig( - dropout=random.random(), - init_weights=["bert", "lora"][i % 2], - ), - ) - - # collect weights of the first adapter so we can compare them to the newly created adapters in the subsequent tests - filter_keys_adapter_0 = [k.format(name=f"{name}_0") for k in ["loras.{name}."]] - adapter_0 = self.filter_parameters(model, filter_keys_adapter_0) - - # Run tests for every combine strategy - for combine_strategy in ["linear", "lora_linear_only_negate_b", "lora_delta_w_svd"]: - if not model_supports_lora_delta_w_svd and combine_strategy == "lora_delta_w_svd": - continue - - with self.subTest(combine_strategy=combine_strategy): - svd_rank = LoRAConfig().r if combine_strategy == "lora_delta_w_svd" else None - model.average_adapter( - adapter_name=f"{combine_strategy}_merged", - adapter_list=[f"{name}_0"], - weights=[1], - combine_strategy=combine_strategy, - svd_rank=svd_rank, - ) - - filter_keys = [k.format(name=f"{combine_strategy}_merged") for k in ["loras.{name}."]] - - if combine_strategy != "lora_delta_w_svd": - for k, v in self.filter_parameters(model, filter_keys).items(): - adapter_0_key = k.replace(f"{combine_strategy}_merged", f"{name}_0") - self.assertTrue(torch.allclose(v, adapter_0[adapter_0_key])) - else: - # For lora_delta_w_svd, we need to calculate the expected weights since lora_delta_w_svd performs an SVD - for i, layer in model.iter_layers(): - for module in layer.modules(): - if isinstance(module, LoRALayer): - if f"{name}_0" in module.loras and f"{combine_strategy}_merged" in module.loras: - original_lora = module.loras[f"{name}_0"] - merged_lora = module.loras[f"{combine_strategy}_merged"] - self._check_svd_weights(original_lora.delta_w, merged_lora, svd_rank) - - def test_edge_case_average_adapters_multiple_adapters(self): - # If we merge multiple adapters with weight 0 except one adapter with weight 1, the resulting adapter should be the same as the adapter with weight 1 - model = self.get_model() - model.eval() - model_supports_lora_delta_w_svd = model.base_model.support_lora_delta_w_svd - - # add adapters to average - name = "test_adapter_" + LoRAConfig().__class__.__name__ - for i in range(3): - model.add_adapter( - f"{name}_{i}", - config=LoRAConfig( - dropout=random.random(), - init_weights=["bert", "lora"][i % 2], - ), - ) - - # collect weights of the first adapter so we can compare them to the newly created adapters in the subsequent tests - 
filter_keys_adapter_0 = [k.format(name=f"{name}_0") for k in ["loras.{name}."]] - adapter_0 = self.filter_parameters(model, filter_keys_adapter_0) - - # Run tests for every combine strategy - for combine_strategy in ["linear", "lora_linear_only_negate_b", "lora_delta_w_svd"]: - if not model_supports_lora_delta_w_svd and combine_strategy == "lora_delta_w_svd": - continue - - with self.subTest(combine_strategy=combine_strategy): - svd_rank = LoRAConfig().r if combine_strategy == "lora_delta_w_svd" else None - - # since normalize_weights is True, this should result in only the first adapter being used with a weight of 1 - model.average_adapter( - adapter_name=f"{combine_strategy}_merged", - adapter_list=[f"{name}_0", f"{name}_1", f"{name}_2"], - weights=[0.5, 0, 0], - combine_strategy=combine_strategy, - svd_rank=svd_rank, - ) - - filter_keys = [k.format(name=f"{combine_strategy}_merged") for k in ["loras.{name}."]] - - if combine_strategy != "lora_delta_w_svd": - for k, v in self.filter_parameters(model, filter_keys).items(): - adapter_1_key = k.replace(f"{combine_strategy}_merged", f"{name}_0") - self.assertTrue(torch.allclose(v, adapter_0[adapter_1_key])) - else: - # For lora_delta_w_svd, we need to calculate the expected weights since lora_delta_w_svd performs an SVD - for i, layer in model.iter_layers(): - for module in layer.modules(): - if isinstance(module, LoRALayer): - if f"{name}_0" in module.loras and f"{combine_strategy}_merged" in module.loras: - original_lora = module.loras[f"{name}_0"] - merged_lora = module.loras[f"{combine_strategy}_merged"] - self._check_svd_weights(original_lora.delta_w, merged_lora, svd_rank) + self.run_average_test(model, LoRAConfig(), ["loras.{name}."]) def test_delete_lora(self): model = self.get_model() diff --git a/tests/methods/test_prefix_tuning.py b/tests/methods/test_prefix_tuning.py index 2b351d0fc..a1c41268b 100644 --- a/tests/methods/test_prefix_tuning.py +++ b/tests/methods/test_prefix_tuning.py @@ -19,9 +19,9 @@ def test_leave_out_prefix_tuning(self): model = self.get_model() self.run_leave_out_test(model, PrefixTuningConfig(flat=True), self.leave_out_layers) - def test_linear_average_prefix_tuning(self): + def test_average_prefix_tuning(self): model = self.get_model() - self.run_linear_average_test(model, PrefixTuningConfig(flat=True), ["prefix_tunings.{name}."]) + self.run_average_test(model, PrefixTuningConfig(flat=True), ["prefix_tunings.{name}."]) def test_delete_prefix_tuning(self): model = self.get_model() @@ -62,7 +62,7 @@ def test_eject_prefix(self): input_data = self.get_input_samples(config=model.config) # user reparamterized prefix - model.set_active_adapters("test_prefix") + model.set_active_adapters(["test_prefix"]) output_1 = model(**input_data) # eject prefix diff --git a/tests/methods/test_prompt_tuning.py b/tests/methods/test_prompt_tuning.py index 97015d131..a5150e1aa 100644 --- a/tests/methods/test_prompt_tuning.py +++ b/tests/methods/test_prompt_tuning.py @@ -10,9 +10,9 @@ def test_add_prompt_tuning(self): model = self.get_model() self.run_add_test(model, PromptTuningConfig(prompt_length=10), ["prompt_tunings.{name}."]) - def test_linear_average_prompt_tuning(self): + def test_average_prompt_tuning(self): model = self.get_model() - self.run_linear_average_test(model, PromptTuningConfig(prompt_length=10), ["prompt_tunings.{name}."]) + self.run_average_test(model, PromptTuningConfig(prompt_length=10), ["prompt_tunings.{name}."]) def test_delete_prompt_tuning(self): model = self.get_model() diff --git 
a/tests/methods/test_reft.py b/tests/methods/test_reft.py index 884922180..5089d4ce1 100644 --- a/tests/methods/test_reft.py +++ b/tests/methods/test_reft.py @@ -29,7 +29,7 @@ def test_layers_reft(self): adapter_config = adapter_config.replace(layers=layers) name = "test_adapter_" + adapter_config.__class__.__name__ model.add_adapter(name, config=adapter_config) - model.set_active_adapters(name) + model.set_active_adapters([name]) # adapter is correctly added to config self.assert_adapter_available(model, name) @@ -47,7 +47,7 @@ def test_average_reft(self): model = self.get_model() for adapter_config, filter_keys in self.reft_configs_to_test: with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__): - self.run_linear_average_test(model, adapter_config, filter_keys) + self.run_average_test(model, adapter_config, filter_keys) def test_delete_reft(self): model = self.get_model() diff --git a/tests/methods/test_unipelt.py b/tests/methods/test_unipelt.py index d29fa5f18..83bbec522 100644 --- a/tests/methods/test_unipelt.py +++ b/tests/methods/test_unipelt.py @@ -10,11 +10,9 @@ def test_add_unipelt(self): model = self.get_model() self.run_add_test(model, UniPELTConfig(), ["loras.{name}.", "adapters.{name}.", "prefix_tunings.{name}."]) - def test_linear_average_unipelt(self): + def test_average_unipelt(self): model = self.get_model() - self.run_linear_average_test( - model, UniPELTConfig(), ["loras.{name}.", "adapters.{name}.", "prefix_tunings.{name}."] - ) + self.run_average_test(model, UniPELTConfig(), ["loras.{name}.", "adapters.{name}.", "prefix_tunings.{name}."]) def test_delete_unipelt(self): model = self.get_model() @@ -53,7 +51,7 @@ def test_output_adapter_gating_scores_unipelt(self): input_data = self.get_input_samples(config=model.config) - model.set_active_adapters(name) + model.set_active_adapters([name]) output_1 = model(**input_data, output_adapter_gating_scores=True) self.assertEqual(len(output_1[0]), self.default_input_samples_shape[0]) diff --git a/tests/models/test_plbart.py b/tests/models/test_plbart.py deleted file mode 100644 index 7fbbfc38d..000000000 --- a/tests/models/test_plbart.py +++ /dev/null @@ -1,12 +0,0 @@ -# flake8: noqa: F403,F405 -from adapters import PLBartAdapterModel -from hf_transformers.tests.models.plbart.test_modeling_plbart import * -from transformers.testing_utils import require_torch - -from .base import AdapterModelTesterMixin - - -@require_torch -class PLBartAdapterModelTest(AdapterModelTesterMixin, PLBartModelTest): - all_model_classes = (PLBartAdapterModel,) - fx_compatible = False diff --git a/tests/test_adapter_backward_compability.py b/tests/test_adapter_backward_compability.py index 6ec2ef214..03c04c792 100644 --- a/tests/test_adapter_backward_compability.py +++ b/tests/test_adapter_backward_compability.py @@ -14,7 +14,7 @@ def test_load_old_non_linearity(self): config = SeqBnConfig(non_linearity="gelu") name = "dummy" model1.add_adapter(name, config=config) - model1.set_active_adapters(name) + model1.set_active_adapters([name]) with tempfile.TemporaryDirectory() as temp_dir: model1.save_adapter(temp_dir, name) @@ -39,10 +39,10 @@ def test_save_version_with_adapter(self): config = SeqBnConfig(non_linearity="gelu") name = "dummy" model.add_adapter(name, config=config) - model.set_active_adapters(name) + model.set_active_adapters([name]) with tempfile.TemporaryDirectory() as temp_dir: model.save_adapter(temp_dir, name) with open(os.path.join(temp_dir, "adapter_config.json"), "r") as file: data = 
json.load(file) - self.assertEqual(__version__, data["version"].replace("adapters.", "")) + self.assertEqual(__version__, data["version"]) diff --git a/tests/test_adapter_config.py b/tests/test_adapter_config.py index db57aeae2..2bce31c7e 100644 --- a/tests/test_adapter_config.py +++ b/tests/test_adapter_config.py @@ -18,6 +18,15 @@ @require_torch class AdapterConfigTest(unittest.TestCase): + def test_config_load(self): + download_kwargs = {"force_download": True} + # TODO still uses the old config names as only these are available on the Hub + for config_name in ["pfeiffer", "houlsby"]: + with self.subTest(config_name=config_name): + config = AdapterConfig.load(config_name, download_kwargs=download_kwargs, non_linearity="leakyrelu") + self.assertTrue(isinstance(config, AdapterConfig)) + self.assertEqual(config.non_linearity, "leakyrelu") + def test_config_immutable(self): def set_attr(config: AdapterConfig): config.non_linearity = "dummy" diff --git a/tests/test_adapter_fusion_common.py b/tests/test_adapter_fusion_common.py index ccc860f66..4ee25fa06 100644 --- a/tests/test_adapter_fusion_common.py +++ b/tests/test_adapter_fusion_common.py @@ -38,7 +38,7 @@ def test_add_adapter_fusion(self): # check forward pass input_data = self.get_input_samples(config=model.config) - model.set_active_adapters(Fuse(name1, name2)) + model.set_active_adapters([[name1, name2]]) model.to(torch_device) adapter_output = model(**input_data) model.set_active_adapters(None) @@ -93,7 +93,7 @@ def test_load_adapter_fusion(self): model2.eval() model1.add_adapter_fusion([name1, name2], adater_fusion_config_name) - model1.set_active_adapters(Fuse(name1, name2)) + model1.set_active_adapters([[name1, name2]]) with tempfile.TemporaryDirectory() as temp_dir: model1.save_adapter_fusion(temp_dir, ",".join([name1, name2])) @@ -136,8 +136,8 @@ def test_load_full_model_fusion(self): # check equal output input_data = self.get_input_samples(config=model1.config) - model1.set_active_adapters(Fuse(name1, name2)) - model2.set_active_adapters(Fuse(name1, name2)) + model1.set_active_adapters([[name1, name2]]) + model2.set_active_adapters([[name1, name2]]) model1.to(torch_device) model2.to(torch_device) output1 = model1(**input_data) diff --git a/tests/test_adapter_heads.py b/tests/test_adapter_heads.py index 0de9134c0..af1749a94 100644 --- a/tests/test_adapter_heads.py +++ b/tests/test_adapter_heads.py @@ -6,7 +6,6 @@ import adapters from adapters import ADAPTER_MODEL_MAPPING, AdapterSetup, AutoAdapterModel from adapters.composition import BatchSplit, Stack -from adapters.heads import PredictionHead from transformers import AutoModelForSequenceClassification from transformers.testing_utils import require_torch, torch_device @@ -456,95 +455,3 @@ def test_save_all_adapters_with_head(self): with tempfile.TemporaryDirectory() as tmp_dir: model.save_all_adapters(tmp_dir, with_head=False) self.assertFalse(os.path.isfile(os.path.join(tmp_dir, "test", "head_config.json"))) - - def test_average_head(self): - # Test the average_head method - model = AutoAdapterModel.from_config(self.config()) - model.eval() - - # Add adapters (this is just to see if the method also works if some heads are associated with an adapter while others are not) - for i in range(2): - model.add_adapter(f"adapter_{i}") - - # Add heads - for i in range(3): - self.add_head(model, f"adapter_{i}") - - # Calculate the expected weights of the new head - weights = [0.75, 0.25, -0.25] - expected_new_head_weights = {} - - for i, weight in enumerate(weights): - current_head: 
PredictionHead = model.heads[f"adapter_{i}"] - for k, v in current_head.named_parameters(): - base_k = k.replace(f"adapter_{i}", "new_head") - if base_k not in expected_new_head_weights: - expected_new_head_weights[base_k] = weight * v - else: - expected_new_head_weights[base_k] += weight * v - - # Average the heads - model.average_head( - head_name="new_head", - head_list=["adapter_0", "adapter_1", "adapter_2"], - weights=weights, - normalize_weights=False, - ) - - # Check that the new head was added - self.assertIn("new_head", model.heads) - - # Now, check that the actual weights are the same as the expected weights. - # Problem: Some heads might have tied weights. These weights therefore are the same as the embedding weights and are NOT the same as the expected weights dictionary. - - # 1. Identify if a layer has tied weights - head1 = model.heads["adapter_0"] - tied_weight_keys = set() - if head1.get_output_embeddings() and model.config.tie_word_embeddings: - output_embeddings = head1.get_output_embeddings() - - # Depending on the head, the tied layer has a different number: Find the layer number of the output embeddings - for name, module in head1.named_modules(): - if module is output_embeddings: - layer_prefix = name + "." - break - - for k, _ in output_embeddings.named_parameters(): - tied_weight_keys.add(f"{layer_prefix}{k}") - - print(f"tied_weight_keys: {tied_weight_keys}") - - # 2. Compare the weights of the new head with the expected weights - for k, v in model.heads["new_head"].named_parameters(): - if k not in tied_weight_keys: - self.assertTrue(torch.allclose(v, expected_new_head_weights[k]), k) - - # 3. Last check: Ensure that tied weights are actually tied - if model.config.tie_word_embeddings: - input_embeddings = model.get_input_embeddings() - output_embeddings = model.heads["new_head"].get_output_embeddings() - if output_embeddings is not None: - self.assertTrue( - torch.allclose(input_embeddings.weight, output_embeddings.weight), - "Input and output embeddings are not properly tied", - ) - - def test_tied_head_weights(self): - # Some heads tie the weights of the last layer to the input embeddings. This test checks that these weights are not trained, except when setting train_embeddings=True - model = AutoAdapterModel.from_config(self.config()) - model.eval() - - # Check if model has add_masked_lm_head method - if "masked_lm" not in ADAPTER_MODEL_MAPPING[self.config_class].head_types: - self.skipTest("Model does not have masked language model head, skip test") - - model.add_adapter("mlm") - model.add_masked_lm_head("mlm") - - # 1. No training of embeddings => weights should not change - model.train_adapter("mlm") - self.assertFalse(model.heads["mlm"].get_output_embeddings().weight.requires_grad) - - # 2. 
Training of embeddings => weights should change - model.train_adapter("mlm", train_embeddings=True) - self.assertTrue(model.heads["mlm"].get_output_embeddings().weight.requires_grad) diff --git a/tests/test_adapter_hub.py b/tests/test_adapter_hub.py index fa29d13b1..267ab06d4 100644 --- a/tests/test_adapter_hub.py +++ b/tests/test_adapter_hub.py @@ -84,7 +84,7 @@ def test_load_task_adapter_from_hub(self): args=training_args, eval_dataset=eval_dataset, compute_metrics=self._compute_glue_metrics("mrpc"), - adapter_names="mrpc", + adapter_names=["mrpc"], ) result = trainer.evaluate() self.assertGreater(result["eval_acc"], 0.9) diff --git a/tests/test_adapter_safetensors.py b/tests/test_adapter_safetensors.py index 3c743c7a9..ef80dd43d 100644 --- a/tests/test_adapter_safetensors.py +++ b/tests/test_adapter_safetensors.py @@ -43,7 +43,7 @@ def test_safetensors_adapter(self): name = "test_adapter" model1.add_adapter(name) model1.add_classification_head(name, num_labels=2) - model1.set_active_adapters(name) + model1.set_active_adapters([name]) temp_dir = tempfile.TemporaryDirectory() # Save & reload adapter @@ -58,7 +58,7 @@ def test_safetensors_adapter(self): self.assertEqual(0, len(loading_info["unexpected_keys"])) # check if adapter was correctly loaded self.assertTrue(name in model2.adapters_config) - model2.set_active_adapters(name) + model2.set_active_adapters([name]) # check equal output input_data = self.get_input_samples((2, 32)) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index d313b656e..7fc358705 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -143,7 +143,7 @@ def test_resume_training_with_fusion(self): model.add_adapter("additional_adapter") model.add_adapter_fusion(Fuse("adapter", "additional_adapter")) model.set_active_adapters(Fuse("adapter", "additional_adapter")) - model.train_adapter_fusion(Fuse("adapter", "additional_adapter")) + model.train_fusion(Fuse("adapter", "additional_adapter")) training_args = TrainingArguments( output_dir=tmpdirname, @@ -167,7 +167,7 @@ def test_resume_training_with_fusion(self): model_resume.add_adapter("additional_adapter") model_resume.add_adapter_fusion(Fuse("adapter", "additional_adapter")) model_resume.set_active_adapters(Fuse("adapter", "additional_adapter")) - model_resume.train_adapter_fusion(Fuse("adapter", "additional_adapter")) + model_resume.train_fusion(Fuse("adapter", "additional_adapter")) trainer_resume = AdapterTrainer( model=model_resume, args=TrainingArguments(do_train=True, max_steps=1, output_dir=tmpdirname), diff --git a/tests/test_mistral.py b/tests/test_mistral.py index b10065a70..3e5d970d4 100644 --- a/tests/test_mistral.py +++ b/tests/test_mistral.py @@ -3,6 +3,8 @@ from transformers.models.mistral.configuration_mistral import MistralConfig from transformers.testing_utils import require_torch +from transformers.testing_utils import require_torch + from .composition.test_parallel import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin from .methods import ( BottleneckAdapterTestMixin, @@ -27,13 +29,12 @@ class MistralAdapterTestBase(AdapterTestBase): MistralConfig, hidden_size=32, num_hidden_layers=5, - num_attention_heads=8, + num_attention_heads=4, intermediate_size=37, hidden_act="gelu", - hidden_dropout_prob=0.1, pad_token_id=0, ) - tokenizer_name = "HuggingFaceH4/zephyr-7b-beta" + tokenizer_name = "mistralai/Mistral-7B-v0.1" @require_torch @@ -63,4 +64,5 @@ class MistralClassConversionTest( MistralAdapterTestBase, unittest.TestCase, ): - pass + def 
+        raise self.skipTest("We don't support the Mistral QA model.")
diff --git a/tests/test_plbart.py b/tests/test_plbart.py
deleted file mode 100644
index aa8445791..000000000
--- a/tests/test_plbart.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import unittest
-
-from tests.methods.test_config_union import ConfigUnionAdapterTest
-from transformers import PLBartConfig
-from transformers.testing_utils import require_torch
-
-from .composition.test_parallel import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin
-from .methods import (
-    BottleneckAdapterTestMixin,
-    CompacterTestMixin,
-    IA3TestMixin,
-    LoRATestMixin,
-    PrefixTuningTestMixin,
-    UniPELTTestMixin,
-)
-from .test_adapter import AdapterTestBase, make_config
-from .test_adapter_backward_compability import CompabilityTestMixin
-from .test_adapter_conversion import ModelClassConversionTestMixin
-from .test_adapter_embeddings import EmbeddingTestMixin
-from .test_adapter_fusion_common import AdapterFusionModelTestMixin
-from .test_adapter_heads import PredictionHeadModelTestMixin
-
-
-class PLBartAdapterTestBase(AdapterTestBase):
-    config_class = PLBartConfig
-    config = make_config(
-        PLBartConfig,
-        d_model=16,
-        encoder_layers=2,
-        decoder_layers=2,
-        encoder_attention_heads=4,
-        decoder_attention_heads=4,
-        encoder_ffn_dim=4,
-        decoder_ffn_dim=4,
-        scale_embedding=False,  # Required for embedding tests
-    )
-    tokenizer_name = "uclanlp/plbart-base"
-
-
-@require_torch
-class PLBartAdapterTest(
-    BottleneckAdapterTestMixin,
-    CompacterTestMixin,
-    IA3TestMixin,
-    LoRATestMixin,
-    PrefixTuningTestMixin,
-    UniPELTTestMixin,
-    AdapterFusionModelTestMixin,
-    CompabilityTestMixin,
-    EmbeddingTestMixin,
-    PredictionHeadModelTestMixin,
-    ParallelAdapterInferenceTestMixin,
-    ParallelTrainingMixin,
-    ConfigUnionAdapterTest,
-    PLBartAdapterTestBase,
-    unittest.TestCase,
-):
-    pass
-
-
-@require_torch
-class PLBartClassConversionTest(
-    ModelClassConversionTestMixin,
-    PLBartAdapterTestBase,
-    unittest.TestCase,
-):
-    pass
diff --git a/utils/back_comp/README.md b/utils/back_comp/README.md
deleted file mode 100644
index 14896c613..000000000
--- a/utils/back_comp/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Backwards Compatibility Tests
-
-## Motivation
-
-This directory contains a set of tests that can be run to ensure that newly introduced changes or refactorings do not break existing functionalities. These tests verify model output consistency between two branches; here, we use the names `dev` and `main` for demonstration purposes, but these tests can be performed between any two branches where the `back_comp` directory with tests is available.
-For this, the test script performs a forward pass for each supported model and compares the outputs between `dev` and `main` to identify any differences.
-
-## Requirements
-
-To execute these tests, you must meet the following requirements:
-
-- Ability to run bash scripts (in-built on Linux/macOS; for Windows, consider using third-party software like [GNU Bash](https://www.gnu.org/software/bash/)).
-- Git as the version control system to switch between branches.
-- The ability to check out the desired branch. If the branch is from another fork, you may need to add the repository as a remote. Refer to [GitHub's instructions](https://docs.github.com/en/get-started/getting-started-with-git/managing-remote-repositories) for details.
-- A Python virtual environment to modify the installed package version of `adapters`.
-
-## Procedure
-
-To perform the compatibility tests, follow these steps:
-
-1. Determine a directory where you want to save the model output generated by the tests. Save this directory path to the variable `SaveDir` in the shell script `compare.sh`. (Careful: select a directory OUTSIDE of the repository; otherwise, the saved model output is no longer available when changing the branch.)
-2. Select the branch you want to compare with `main` and save its name to the variable `Branch`.
-3. Make sure you are checked out in `main` before starting the test script.
-4. In your command line, navigate to the `back_comp` directory and execute the script by running `sh compare.sh`.
-
-The results will be displayed in the command line for visualization.
\ No newline at end of file
diff --git a/utils/back_comp/Utils.py b/utils/back_comp/Utils 2.py
similarity index 97%
rename from utils/back_comp/Utils.py
rename to utils/back_comp/Utils 2.py
index 21c15545f..8ed482130 100644
--- a/utils/back_comp/Utils.py
+++ b/utils/back_comp/Utils 2.py
@@ -29,7 +29,6 @@
    GPT2Config,
    GPTJConfig,
    MBartConfig,
-    PLBartConfig,
    RobertaConfig,
    T5Config,
    ViTConfig,
@@ -131,7 +130,6 @@ def get_model_names():
        "gpt2",
        "gptj",
        "mbart",
-        "plbart",
        "roberta",
        "t5",
        "vit",
@@ -285,19 +283,6 @@ def create_model(model_name: str, model_class: Any) -> Any:
        )
        model = model_class.from_config(mbart_config)
 
-    elif model_name == "plbart":
-        plbart_config = PLBartConfig(
-            d_model=16,
-            encoder_layers=2,
-            decoder_layers=2,
-            encoder_attention_heads=4,
-            decoder_attention_heads=4,
-            encoder_ffn_dim=4,
-            decoder_ffn_dim=4,
-            vocab_size=50005,
-        )
-        model = model_class.from_config(plbart_config)
-
    elif model_name == "roberta":
        roberta_config = RobertaConfig(
            hidden_size=32,
diff --git a/utils/back_comp/compare_outputs.py b/utils/back_comp/compare_outputs.py
deleted file mode 100644
index 0775bf1bd..000000000
--- a/utils/back_comp/compare_outputs.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import argparse
-import os
-
-from Utils import (
-    compare_lists_close,
-    convert_tensors_to_list,
-    create_output,
-    fix_seeds,
-    get_model_names,
-    get_new_adapter_config_strings,
-    load_model,
-    restore_from_jsonl,
-)
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str)
-args = parser.parse_args()
-
-
-# Create the root path
-base_dir = os.path.join(args.path, "model_outputs")
-fix_seeds()
-
-for model_name in get_model_names():
-    # Load the reference model
-    print(f"Model = {model_name}")
-    model_dir = os.path.join(base_dir, model_name)
-    model = load_model(model_name, os.path.join(model_dir, "model_weights"))
-
-    for adapter_config in get_new_adapter_config_strings():
-        # Create a new model output
-        adapter_name = model.load_adapter(os.path.join(model_dir, "weights_" + adapter_config))
-        model.set_active_adapters(adapter_name)
-        model_output = create_output(model, model_name)
-
-        # Compare the model output to the reference output
-        model_output_n, last_hidden_state = convert_tensors_to_list(model_output)
-        ref_output = restore_from_jsonl(config=adapter_config, file_path=os.path.join(model_dir, "output.jsonl"))
-        is_equal = compare_lists_close(ref_output, model_output_n, rtol=1e-05, atol=1e-08)
-        print(f"Adapter: {adapter_config} -> {is_equal}")
-
-    model.delete_adapter(adapter_name)
diff --git a/utils/convert_xmod_checkpoint.py b/utils/convert_xmod_checkpoint.py
index b3744fece..30ca0ede7 100644
--- a/utils/convert_xmod_checkpoint.py
+++ b/utils/convert_xmod_checkpoint.py
@@ -1,7 +1,6 @@
 """
 This script can be used to convert an Xmod checkpoints (including adapters) from the HF format to the Adapters format.
 """
-
 import argparse
 import os
 import re