From f48a47bdda5ed044b7634181558cacf2f29d5757 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Wed, 15 Oct 2025 15:47:21 +0000 Subject: [PATCH 01/56] remove attributes and add all missing sub processors to their auto classes --- .../models/align/processing_align.py | 3 - .../models/altclip/processing_altclip.py | 4 - .../models/aria/processing_aria.py | 4 - .../models/auto/feature_extraction_auto.py | 9 + .../models/auto/image_processing_auto.py | 17 +- .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 26 +++ .../models/auto/video_processing_auto.py | 1 + .../aya_vision/processing_aya_vision.py | 4 - .../models/bark/processing_bark.py | 1 - .../models/blip/processing_blip.py | 4 - .../models/blip_2/processing_blip_2.py | 4 - .../bridgetower/processing_bridgetower.py | 3 - .../models/bros/processing_bros.py | 1 - .../models/chameleon/processing_chameleon.py | 4 - .../chinese_clip/processing_chinese_clip.py | 4 - .../models/clap/processing_clap.py | 3 - .../models/clip/processing_clip.py | 4 - .../models/clipseg/processing_clipseg.py | 4 - .../models/clvp/processing_clvp.py | 3 - .../processing_cohere2_vision.py | 4 - .../models/colpali/processing_colpali.py | 4 - .../models/colqwen2/processing_colqwen2.py | 5 - src/transformers/models/csm/processing_csm.py | 4 - .../deepseek_vl/processing_deepseek_vl.py | 3 - .../processing_deepseek_vl_hybrid.py | 3 - src/transformers/models/dia/processing_dia.py | 2 - .../models/donut/processing_donut.py | 4 - .../models/emu3/processing_emu3.py | 4 - .../models/evolla/processing_evolla.py | 1 - .../models/flava/processing_flava.py | 4 - .../models/florence2/processing_florence2.py | 4 - .../models/fuyu/processing_fuyu.py | 4 - .../models/gemma3/processing_gemma3.py | 4 - .../models/gemma3n/processing_gemma3n.py | 5 - src/transformers/models/git/processing_git.py | 4 - .../models/glm4v/processing_glm4v.py | 6 - .../models/got_ocr2/processing_got_ocr2.py | 4 - .../processing_granite_speech.py | 1 - 
.../processing_grounding_dino.py | 3 - .../models/idefics/processing_idefics.py | 4 - .../models/idefics2/processing_idefics2.py | 4 - .../models/idefics3/processing_idefics3.py | 4 - .../instructblip/processing_instructblip.py | 3 - .../processing_instructblipvideo.py | 5 - .../models/internvl/processing_internvl.py | 5 - .../models/janus/processing_janus.py | 4 - .../models/kosmos2/processing_kosmos2.py | 4 - .../models/kosmos2_5/processing_kosmos2_5.py | 4 - .../processing_kyutai_speech_to_text.py | 2 - .../layoutlmv2/processing_layoutlmv2.py | 4 - .../layoutlmv3/processing_layoutlmv3.py | 4 - .../models/layoutxlm/processing_layoutxlm.py | 4 - .../models/lfm2_vl/processing_lfm2_vl.py | 4 - .../models/llama4/processing_llama4.py | 4 - .../models/llava/processing_llava.py | 4 - .../llava_next/processing_llava_next.py | 4 - .../processing_llava_next_video.py | 5 - .../processing_llava_onevision.py | 5 - .../models/markuplm/processing_markuplm.py | 2 - .../models/mgp_str/processing_mgp_str.py | 2 - .../models/mllama/processing_mllama.py | 4 - .../models/musicgen/processing_musicgen.py | 3 - .../processing_musicgen_melody.py | 3 - .../models/nougat/processing_nougat.py | 4 - .../omdet_turbo/processing_omdet_turbo.py | 4 - .../models/oneformer/processing_oneformer.py | 4 - .../models/ovis2/processing_ovis2.py | 4 - .../models/owlv2/processing_owlv2.py | 4 - .../models/owlvit/processing_owlvit.py | 4 - .../models/paligemma/processing_paligemma.py | 4 - .../models/parakeet/processing_parakeet.py | 4 - .../perception_lm/processing_perception_lm.py | 5 - .../processing_phi4_multimodal.py | 3 - .../pix2struct/processing_pix2struct.py | 4 - .../models/pixtral/processing_pixtral.py | 4 - .../models/pop2piano/processing_pop2piano.py | 4 - .../qwen2_5_omni/processing_qwen2_5_omni.py | 6 - .../qwen2_5_vl/processing_qwen2_5_vl.py | 6 - .../qwen2_audio/processing_qwen2_audio.py | 4 - .../models/qwen2_vl/processing_qwen2_vl.py | 5 - .../processing_qwen3_omni_moe.py | 6 - 
.../models/qwen3_vl/processing_qwen3_vl.py | 5 - src/transformers/models/sam/processing_sam.py | 3 - .../models/sam2/processing_sam2.py | 3 - .../sam2_video/processing_sam2_video.py | 4 - .../models/sam_hq/processing_samhq.py | 3 - .../seamless_m4t/processing_seamless_m4t.py | 2 - .../models/siglip/processing_siglip.py | 4 - .../models/siglip2/processing_siglip2.py | 4 - .../models/smolvlm/processing_smolvlm.py | 5 - .../processing_speech_to_text.py | 3 - .../models/speecht5/processing_speecht5.py | 3 - .../models/trocr/processing_trocr.py | 4 - src/transformers/models/tvp/processing_tvp.py | 4 - .../models/udop/processing_udop.py | 4 - .../video_llama_3/processing_video_llama_3.py | 5 - .../video_llava/processing_video_llava.py | 5 - .../models/vilt/processing_vilt.py | 3 - .../processing_vision_text_dual_encoder.py | 4 - .../models/voxtral/processing_voxtral.py | 4 - .../models/wav2vec2/processing_wav2vec2.py | 3 - .../wav2vec2_bert/processing_wav2vec2_bert.py | 3 - .../processing_wav2vec2_with_lm.py | 3 - .../models/whisper/processing_whisper.py | 3 - .../models/x_clip/processing_x_clip.py | 8 +- src/transformers/processing_utils.py | 157 +++++++++++------- .../wav2vec2/test_processing_wav2vec2.py | 7 +- .../test_processing_wav2vec2_bert.py | 7 +- .../test_processing_wav2vec2_with_lm.py | 16 +- tests/test_processing_common.py | 122 +++++--------- 111 files changed, 213 insertions(+), 535 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index fbca27b2ff39..ac927b8d2306 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -59,9 +59,6 @@ class AlignProcessor(ProcessorMixin): """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "EfficientNetImageProcessor" - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = AlignProcessorKwargs def __init__(self, image_processor, 
tokenizer): diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 24631ecacbd7..933a5e48dfed 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -35,10 +35,6 @@ class AltCLIPProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") - tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast") - @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor") def __init__(self, image_processor=None, tokenizer=None): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index d0841c96aee2..c29c289649da 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -67,10 +67,6 @@ class AriaProcessor(ProcessorMixin): A dictionary indicating size conversions for images. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AriaImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 38f09a5a3ee8..0f0b6660fc36 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -42,6 +42,7 @@ ("audio-spectrogram-transformer", "ASTFeatureExtractor"), ("clap", "ClapFeatureExtractor"), ("clvp", "ClvpFeatureExtractor"), + ("csm", "EncodecFeatureExtractor"), ("dac", "DacFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("dia", "DiaFeatureExtractor"), @@ -50,14 +51,21 @@ ("granite_speech", "GraniteSpeechFeatureExtractor"), ("hubert", "Wav2Vec2FeatureExtractor"), ("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"), + ("markuplm", "MarkupLMFeatureExtractor"), ("mctct", "MCTCTFeatureExtractor"), ("mimi", "EncodecFeatureExtractor"), ("moonshine", "Wav2Vec2FeatureExtractor"), ("moshi", "EncodecFeatureExtractor"), + ("musicgen", "EncodecFeatureExtractor"), + ("musicgen_melody", "MusicgenMelodyFeatureExtractor"), + ("parakeet", "ParakeetFeatureExtractor"), ("parakeet_ctc", "ParakeetFeatureExtractor"), ("parakeet_encoder", "ParakeetFeatureExtractor"), ("phi4_multimodal", "Phi4MultimodalFeatureExtractor"), ("pop2piano", "Pop2PianoFeatureExtractor"), + ("qwen2_5_omni", "WhisperFeatureExtractor"), + ("qwen2_audio", "WhisperFeatureExtractor"), + ("qwen3_omni_moe", "WhisperFeatureExtractor"), ("seamless_m4t", "SeamlessM4TFeatureExtractor"), ("seamless_m4t_v2", "SeamlessM4TFeatureExtractor"), ("sew", "Wav2Vec2FeatureExtractor"), @@ -67,6 +75,7 @@ ("unispeech", "Wav2Vec2FeatureExtractor"), ("unispeech-sat", "Wav2Vec2FeatureExtractor"), ("univnet", "UnivNetFeatureExtractor"), + ("voxtral", "WhisperFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), 
("wav2vec2-bert", "Wav2Vec2FeatureExtractor"), ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index f864d107914f..4aeae5571c52 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -62,7 +62,9 @@ ("aimv2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("aimv2_vision_model", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), + ("altclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("aria", ("AriaImageProcessor", None)), + ("aya_vision", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")), ("beit", ("BeitImageProcessor", "BeitImageProcessorFast")), ("bit", ("BitImageProcessor", "BitImageProcessorFast")), ("blip", ("BlipImageProcessor", "BlipImageProcessorFast")), @@ -73,6 +75,8 @@ ("clip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")), ("cohere2_vision", (None, "Cohere2VisionImageProcessorFast")), + ("colpali", ("SiglipImageProcessor", "SiglipImageProcessorFast")), + ("colqwen2", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("conditional_detr", ("ConditionalDetrImageProcessor", "ConditionalDetrImageProcessorFast")), ("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), @@ -94,10 +98,13 @@ ("edgetam", (None, "Sam2ImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor", None)), ("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")), + ("emu3", ("Emu3ImageProcessor", None)), ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), ("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")), ("flava", ("FlavaImageProcessor", 
"FlavaImageProcessorFast")), - ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")), + ("florence2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")), ("fuyu", ("FuyuImageProcessor", None)), ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")), ("gemma3n", ("SiglipImageProcessor", "SiglipImageProcessorFast")), @@ -115,11 +122,13 @@ ("imagegpt", ("ImageGPTImageProcessor", "ImageGPTImageProcessorFast")), ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")), ("instructblipvideo", ("InstructBlipVideoImageProcessor", None)), + ("internvl", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")), ("janus", ("JanusImageProcessor", "JanusImageProcessorFast")), ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("kosmos-2.5", ("Kosmos2_5ImageProcessor", "Kosmos2_5ImageProcessorFast")), ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), + ("layoutxlm", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")), ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")), ("lightglue", ("LightGlueImageProcessor", None)), @@ -142,6 +151,7 @@ ("mobilevitv2", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")), ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")), + ("omdet-turbo", ("DetrImageProcessor", "DetrImageProcessorFast")), ("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")), ("ovis2", ("Ovis2ImageProcessor", "Ovis2ImageProcessorFast")), ("owlv2", ("Owlv2ImageProcessor", "Owlv2ImageProcessorFast")), @@ -156,14 +166,17 @@ ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")), ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")), ("pvt_v2",
("PvtImageProcessor", "PvtImageProcessorFast")), + ("qwen2_5_omni", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), + ("qwen3_omni_moe", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen3_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")), ("sam", ("SamImageProcessor", "SamImageProcessorFast")), ("sam2", (None, "Sam2ImageProcessorFast")), + ("sam2_video", (None, "Sam2ImageProcessorFast")), ("sam_hq", ("SamImageProcessor", "SamImageProcessorFast")), ("segformer", ("SegformerImageProcessor", "SegformerImageProcessorFast")), ("seggpt", ("SegGptImageProcessor", None)), @@ -181,12 +194,14 @@ ("textnet", ("TextNetImageProcessor", "TextNetImageProcessorFast")), ("timesformer", ("VideoMAEImageProcessor", None)), ("timm_wrapper", ("TimmWrapperImageProcessor", None)), + ("trocr", ("ViTImageProcessor", "ViTImageProcessorFast")), ("tvlt", ("TvltImageProcessor", None)), ("tvp", ("TvpImageProcessor", "TvpImageProcessorFast")), ("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), ("upernet", ("SegformerImageProcessor", "SegformerImageProcessorFast")), ("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("video_llama_3", ("VideoLlama3ImageProcessor", "VideoLlama3ImageProcessorFast")), + ("video_llava", ("VideoLlavaImageProcessor", None)), ("videomae", ("VideoMAEImageProcessor", None)), ("vilt", ("ViltImageProcessor", "ViltImageProcessorFast")), ("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index cb2eb94cecd4..3c2230722ca9 100644 --- 
a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -108,6 +108,7 @@ ("mllama", "MllamaProcessor"), ("mm-grounding-dino", "GroundingDinoProcessor"), ("moonshine", "Wav2Vec2Processor"), + ("omdet-turbo", "OmDetTurboProcessor"), ("oneformer", "OneFormerProcessor"), ("ovis2", "Ovis2Processor"), ("owlv2", "Owlv2Processor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index ccee9937afa6..ed943b231ead 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -73,6 +73,7 @@ ), ), ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("altclip", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), ("arcee", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), @@ -157,6 +158,7 @@ ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), ("cohere2", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), + ("cohere2_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), ("colpali", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("colqwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), @@ -225,6 +227,7 @@ ), ), ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), + ("donut", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if 
is_tokenizers_available() else None)), ( "dpr", ( @@ -239,6 +242,7 @@ ("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)), ("esm", ("EsmTokenizer", None)), + ("evolla", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "exaone4", ( @@ -253,10 +257,13 @@ ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None), ), ("flaubert", ("FlaubertTokenizer", None)), + ("flava", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("flex_olmo", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("florence2", ("BartTokenizer", "BartTokenizerFast" if is_tokenizers_available() else None)), ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), + ("fuyu", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "gemma", ( @@ -305,6 +312,7 @@ ("glm4_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("got_ocr2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), @@ -315,6 +323,7 @@ ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)), ("granite", ("GPT2Tokenizer", None)), + ("granite_speech", ("GPT2Tokenizer", None)),
("granitemoe", ("GPT2Tokenizer", None)), ("granitemoehybrid", ("GPT2Tokenizer", None)), ("granitemoeshared", ("GPT2Tokenizer", None)), @@ -354,11 +363,14 @@ ), ), ("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("kyutai_speech_to_text", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)), ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), + ("lfm2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("lfm2_vl", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), ( "llama", @@ -399,6 +411,7 @@ ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), + ("markuplm", ("MarkupLMTokenizer", "MarkupLMTokenizerFast" if is_tokenizers_available() else None)), ( "mbart", ( @@ -485,6 +498,7 @@ "NllbTokenizerFast" if is_tokenizers_available() else None, ), ), + ("nougat", (None, "NougatTokenizerFast" if is_tokenizers_available() else None)), ( "nystromformer", ( @@ -506,6 +520,7 @@ ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None), ), ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("ovis2", (None, "Qwen2TokenizerFast" if is_tokenizers_available() else
None)), ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), @@ -531,6 +546,7 @@ None, ), ), + ("perception_lm", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "persimmon", ( @@ -540,6 +556,7 @@ ), ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("phi4_multimodal", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("phobert", ("PhobertTokenizer", None)), ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), @@ -553,6 +570,7 @@ ), ), ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), + ("pop2piano", ("Pop2PianoTokenizer", None)), ("prophetnet", ("ProphetNetTokenizer", None)), ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ( @@ -659,6 +677,7 @@ ), ), ("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("smolvlm", ("PreTrainedTokenizer", "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)), @@ -693,6 +712,7 @@ ("tapas", ("TapasTokenizer", None)), ("tapex", ("TapexTokenizer", None)), ("transfo-xl", ("TransfoXLTokenizer", None)), + ("trocr", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), ("tvp", ("BertTokenizer", "BertTokenizerFast" if 
is_tokenizers_available() else None)), ( "udop", @@ -708,9 +728,14 @@ "T5TokenizerFast" if is_tokenizers_available() else None, ), ), + ("video_llama_3", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("video_llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ( + "vision_text_dual_encoder", + ("PreTrainedTokenizer", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ), ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("vits", ("VitsTokenizer", None)), ( @@ -726,6 +751,7 @@ ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), + ("wav2vec2_with_lm", ("Wav2Vec2CTCTokenizer", None)), ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)), ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ( diff --git a/src/transformers/models/auto/video_processing_auto.py b/src/transformers/models/auto/video_processing_auto.py index 78956269331f..c660b2e7a1ff 100644 --- a/src/transformers/models/auto/video_processing_auto.py +++ b/src/transformers/models/auto/video_processing_auto.py @@ -64,6 +64,7 @@ ("video_llava", "VideoLlavaVideoProcessor"), ("videomae", "VideoMAEVideoProcessor"), ("vjepa2", "VJEPA2VideoProcessor"), + ("video_llama_3", "VideoLlama3VideoProcessor"), # PLACEHOLDER - needs proper video processor class ] ) diff --git a/src/transformers/models/aya_vision/processing_aya_vision.py b/src/transformers/models/aya_vision/processing_aya_vision.py index 882a85d40946..049b0e5d24eb 100644 --- a/src/transformers/models/aya_vision/processing_aya_vision.py +++ 
b/src/transformers/models/aya_vision/processing_aya_vision.py @@ -70,10 +70,6 @@ class AyaVisionProcessor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 0602986483a6..65a1700837d9 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -49,7 +49,6 @@ class BarkProcessor(ProcessorMixin): """ - tokenizer_class = "AutoTokenizer" attributes = ["tokenizer"] preset_shape = { diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index f600e8ce27d8..965164206c5a 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -53,10 +53,6 @@ class BlipProcessor(ProcessorMixin): An instance of ['BertTokenizerFast`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor, tokenizer, **kwargs): tokenizer.return_token_type_ids = False super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 40729f4f4501..5949e2c648ce 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -60,10 +60,6 @@ class Blip2Processor(ProcessorMixin): Number of tokens used by the Qformer as queries, should be same as in model's config. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): tokenizer.return_token_type_ids = False if not hasattr(tokenizer, "image_token"): diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 030c578c49cd..5de97ec411dc 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -54,9 +54,6 @@ class BridgeTowerProcessor(ProcessorMixin): An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "BridgeTowerImageProcessor" - tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") valid_processor_kwargs = BridgeTowerProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/bros/processing_bros.py b/src/transformers/models/bros/processing_bros.py index 8de0a1c49b0d..fe58e17b12b6 100644 --- a/src/transformers/models/bros/processing_bros.py +++ b/src/transformers/models/bros/processing_bros.py @@ -47,7 +47,6 @@ class BrosProcessor(ProcessorMixin): """ attributes = ["tokenizer"] - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = BrosProcessorKwargs def __init__(self, tokenizer=None, **kwargs): diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 247f72322a2d..694be7ab8f26 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -69,10 +69,6 @@ class ChameleonProcessor(ProcessorMixin): The special token used to indicate image in the text. 
""" - attributes = ["image_processor", "tokenizer"] - tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") - image_processor_class = "ChameleonImageProcessor" - def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): self.image_seq_length = image_seq_length self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 0510b9b0f3c9..6508136f772e 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -34,10 +34,6 @@ class ChineseCLIPProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast") - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 6524a8715841..a72151cb9b63 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -42,9 +42,6 @@ class ClapProcessor(ProcessorMixin): The tokenizer is a required input. 
""" - feature_extractor_class = "ClapFeatureExtractor" - tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") - def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 7b856f9981ee..9258d2e8fee3 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -33,10 +33,6 @@ class CLIPProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 39e091106c71..4d431181cb4f 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -34,10 +34,6 @@ class CLIPSegProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast") - tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clvp/processing_clvp.py b/src/transformers/models/clvp/processing_clvp.py index 8fad43cd2f30..331589a23999 100644 --- a/src/transformers/models/clvp/processing_clvp.py +++ b/src/transformers/models/clvp/processing_clvp.py @@ -38,9 +38,6 @@ class ClvpProcessor(ProcessorMixin): An instance of [`ClvpTokenizer`]. The tokenizer is a required input. 
""" - feature_extractor_class = "ClvpFeatureExtractor" - tokenizer_class = "ClvpTokenizer" - def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) diff --git a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py index d4fcec4da875..b34fd1c5594e 100644 --- a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -47,10 +47,6 @@ class Cohere2VisionProcessor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index cd33607a35fd..1ad511ced7a7 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -93,10 +93,6 @@ class ColPaliProcessor(ProcessorMixin): A prefix to be used for the query. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast") - tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index 2eb9fed873a8..00f00c920856 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -64,11 +64,6 @@ class ColQwen2Processor(ProcessorMixin): query_prefix (`str`, *optional*): A prefix to be used for the query. 
""" - attributes = ["image_processor", "tokenizer"] - - image_processor_class = "AutoImageProcessor" - tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index 172016f6431d..d77ffeffd896 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -95,10 +95,6 @@ class CsmProcessor(ProcessorMixin): """ - attributes = ["feature_extractor", "tokenizer"] - feature_extractor_class = "EncodecFeatureExtractor" - tokenizer_class = "PreTrainedTokenizerFast" - def __init__( self, feature_extractor, diff --git a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py index ddeb4f799ee1..0ebf46ac5ad0 100644 --- a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py @@ -52,10 +52,7 @@ class DeepseekVLProcessor(ProcessorMixin): The number of special image tokens used as placeholders for visual content in text sequences. """ - attributes = ["image_processor", "tokenizer"] valid_kwargs = ["chat_template", "num_image_tokens"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" def __init__( self, diff --git a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py index d20fa495f9b8..9049495932f6 100644 --- a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py @@ -52,10 +52,7 @@ class DeepseekVLHybridProcessor(ProcessorMixin): The number of special image tokens used as placeholders for visual content in text sequences. 
""" - attributes = ["image_processor", "tokenizer"] valid_kwargs = ["chat_template", "num_image_tokens"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" def __init__( self, diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index 6518b5444639..23c04687308c 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -77,8 +77,6 @@ class DiaProcessor(ProcessorMixin): An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input. """ - feature_extractor_class = "DiaFeatureExtractor" - tokenizer_class = "DiaTokenizer" audio_tokenizer_class = "DacModel" def __init__(self, feature_extractor, tokenizer, audio_tokenizer): diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 65ca58bcf781..fedd173117eb 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -48,10 +48,6 @@ class DonutProcessor(ProcessorMixin): An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index b7ed8e9074f0..52f39a913c54 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -64,10 +64,6 @@ class Emu3Processor(ProcessorMixin): in a chat into a tokenizable string. 
""" - attributes = ["image_processor", "tokenizer"] - tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast") - image_processor_class = "Emu3ImageProcessor" - def __init__( self, image_processor, diff --git a/src/transformers/models/evolla/processing_evolla.py b/src/transformers/models/evolla/processing_evolla.py index 3be0e07364a6..464ded903105 100644 --- a/src/transformers/models/evolla/processing_evolla.py +++ b/src/transformers/models/evolla/processing_evolla.py @@ -52,7 +52,6 @@ class EvollaProcessor(ProcessorMixin): # protein_tokenizer_class = "EsmTokenizer" # tokenizer_class = "LlamaTokenizerFast" protein_tokenizer_class = "AutoTokenizer" - tokenizer_class = "AutoTokenizer" protein_tokenizer_dir_name = "protein_tokenizer" # tokenizer_dir_name = "text_tokenizer" diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 272fb01d7b7a..7e5b3c0e012e 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -31,10 +31,6 @@ class FlavaProcessor(ProcessorMixin): tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "FlavaImageProcessor" - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/florence2/processing_florence2.py b/src/transformers/models/florence2/processing_florence2.py index 1c25ddceeafc..c8d699e4bc3e 100644 --- a/src/transformers/models/florence2/processing_florence2.py +++ b/src/transformers/models/florence2/processing_florence2.py @@ -62,10 +62,6 @@ class Florence2Processor(ProcessorMixin): thresholds, or banned tokens. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = ("BartTokenizer", "BartTokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index a715ce412313..ee697deccf9e 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -347,10 +347,6 @@ class FuyuProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "FuyuImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor=image_processor, tokenizer=tokenizer) self.image_processor = image_processor diff --git a/src/transformers/models/gemma3/processing_gemma3.py b/src/transformers/models/gemma3/processing_gemma3.py index a9bac5b69e47..11574e30b7c1 100644 --- a/src/transformers/models/gemma3/processing_gemma3.py +++ b/src/transformers/models/gemma3/processing_gemma3.py @@ -42,10 +42,6 @@ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): class Gemma3Processor(ProcessorMixin): - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, diff --git a/src/transformers/models/gemma3n/processing_gemma3n.py b/src/transformers/models/gemma3n/processing_gemma3n.py index 913336b8d3f5..51b686557ed0 100644 --- a/src/transformers/models/gemma3n/processing_gemma3n.py +++ b/src/transformers/models/gemma3n/processing_gemma3n.py @@ -51,11 +51,6 @@ class Gemma3nProcessor(ProcessorMixin): The number of image soft tokens that should be added to """ - attributes = ["feature_extractor", "image_processor", "tokenizer"] - feature_extractor_class = "AutoFeatureExtractor" - image_processor_class = "AutoImageProcessor" - 
tokenizer_class = "AutoTokenizer" - def __init__( self, feature_extractor, diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 2eba7c68f584..89cfc9618987 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -33,10 +33,6 @@ class GitProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index e8f9c948c66d..79935cbde7b4 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -59,12 +59,6 @@ class Glm4vProcessor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer", "video_processor"] - image_processor_class = "AutoImageProcessor" - video_processor_class = "AutoVideoProcessor" - - tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 1843b7f28830..162efef5e9f9 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -93,10 +93,6 @@ class GotOcr2Processor(ProcessorMixin): in a chat into a tokenizable string. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "PreTrainedTokenizerFast" - def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 0b76ccfe75db..b3d3a8719185 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -32,7 +32,6 @@ class GraniteSpeechProcessor(ProcessorMixin): attributes = ["audio_processor", "tokenizer"] audio_processor_class = "GraniteSpeechFeatureExtractor" - tokenizer_class = "AutoTokenizer" def __init__( self, diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 5f2f900451b2..74565588d852 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -130,9 +130,6 @@ class GroundingDinoProcessor(ProcessorMixin): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "GroundingDinoImageProcessor" - tokenizer_class = "AutoTokenizer" valid_processor_kwargs = GroundingDinoProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index b0ad20df386b..7cb640e56854 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -153,10 +153,6 @@ class IdeficsProcessor(ProcessorMixin): The string representation of token representing end of utterance """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "IdeficsImageProcessor" - tokenizer_class = "LlamaTokenizerFast" - def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs): super().__init__(image_processor, tokenizer) self.image_token_id = ( diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index c419a3641254..df5f9ca73a8b 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -75,10 +75,6 @@ class Idefics2Processor(ProcessorMixin): in a chat into a tokenizable string. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics2ImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs ): diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 451af1d8a38f..373e3e3ed9f3 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -121,10 +121,6 @@ class Idefics3Processor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics3ImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs ): diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index afe43c1fc7a7..2ae4a0a8a229 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -65,9 +65,6 @@ class InstructBlipProcessor(ProcessorMixin): Number of tokens used by the Qformer as queries, should be same as in model's config. 
""" - attributes = ["image_processor", "tokenizer", "qformer_tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") - tokenizer_class = "AutoTokenizer" qformer_tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index ee4e843e2f33..f609b3e1be0c 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -55,11 +55,6 @@ class InstructBlipVideoProcessor(ProcessorMixin): Number of tokens used by the Qformer as queries, should be same as in model's config. """ - attributes = ["video_processor", "tokenizer", "qformer_tokenizer"] - video_processor_class = "AutoVideoProcessor" - tokenizer_class = "AutoTokenizer" - qformer_tokenizer_class = "AutoTokenizer" - def __init__(self, video_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): if not hasattr(tokenizer, "video_token"): self.video_token = AddedToken("