From f48a47bdda5ed044b7634181558cacf2f29d5757 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Wed, 15 Oct 2025 15:47:21 +0000 Subject: [PATCH 01/56] remove attributes and add all missing sub processors to their auto classes --- .../models/align/processing_align.py | 3 - .../models/altclip/processing_altclip.py | 4 - .../models/aria/processing_aria.py | 4 - .../models/auto/feature_extraction_auto.py | 9 + .../models/auto/image_processing_auto.py | 17 +- .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 26 +++ .../models/auto/video_processing_auto.py | 1 + .../aya_vision/processing_aya_vision.py | 4 - .../models/bark/processing_bark.py | 1 - .../models/blip/processing_blip.py | 4 - .../models/blip_2/processing_blip_2.py | 4 - .../bridgetower/processing_bridgetower.py | 3 - .../models/bros/processing_bros.py | 1 - .../models/chameleon/processing_chameleon.py | 4 - .../chinese_clip/processing_chinese_clip.py | 4 - .../models/clap/processing_clap.py | 3 - .../models/clip/processing_clip.py | 4 - .../models/clipseg/processing_clipseg.py | 4 - .../models/clvp/processing_clvp.py | 3 - .../processing_cohere2_vision.py | 4 - .../models/colpali/processing_colpali.py | 4 - .../models/colqwen2/processing_colqwen2.py | 5 - src/transformers/models/csm/processing_csm.py | 4 - .../deepseek_vl/processing_deepseek_vl.py | 3 - .../processing_deepseek_vl_hybrid.py | 3 - src/transformers/models/dia/processing_dia.py | 2 - .../models/donut/processing_donut.py | 4 - .../models/emu3/processing_emu3.py | 4 - .../models/evolla/processing_evolla.py | 1 - .../models/flava/processing_flava.py | 4 - .../models/florence2/processing_florence2.py | 4 - .../models/fuyu/processing_fuyu.py | 4 - .../models/gemma3/processing_gemma3.py | 4 - .../models/gemma3n/processing_gemma3n.py | 5 - src/transformers/models/git/processing_git.py | 4 - .../models/glm4v/processing_glm4v.py | 6 - .../models/got_ocr2/processing_got_ocr2.py | 4 - .../processing_granite_speech.py | 1 - 
.../processing_grounding_dino.py | 3 - .../models/idefics/processing_idefics.py | 4 - .../models/idefics2/processing_idefics2.py | 4 - .../models/idefics3/processing_idefics3.py | 4 - .../instructblip/processing_instructblip.py | 3 - .../processing_instructblipvideo.py | 5 - .../models/internvl/processing_internvl.py | 5 - .../models/janus/processing_janus.py | 4 - .../models/kosmos2/processing_kosmos2.py | 4 - .../models/kosmos2_5/processing_kosmos2_5.py | 4 - .../processing_kyutai_speech_to_text.py | 2 - .../layoutlmv2/processing_layoutlmv2.py | 4 - .../layoutlmv3/processing_layoutlmv3.py | 4 - .../models/layoutxlm/processing_layoutxlm.py | 4 - .../models/lfm2_vl/processing_lfm2_vl.py | 4 - .../models/llama4/processing_llama4.py | 4 - .../models/llava/processing_llava.py | 4 - .../llava_next/processing_llava_next.py | 4 - .../processing_llava_next_video.py | 5 - .../processing_llava_onevision.py | 5 - .../models/markuplm/processing_markuplm.py | 2 - .../models/mgp_str/processing_mgp_str.py | 2 - .../models/mllama/processing_mllama.py | 4 - .../models/musicgen/processing_musicgen.py | 3 - .../processing_musicgen_melody.py | 3 - .../models/nougat/processing_nougat.py | 4 - .../omdet_turbo/processing_omdet_turbo.py | 4 - .../models/oneformer/processing_oneformer.py | 4 - .../models/ovis2/processing_ovis2.py | 4 - .../models/owlv2/processing_owlv2.py | 4 - .../models/owlvit/processing_owlvit.py | 4 - .../models/paligemma/processing_paligemma.py | 4 - .../models/parakeet/processing_parakeet.py | 4 - .../perception_lm/processing_perception_lm.py | 5 - .../processing_phi4_multimodal.py | 3 - .../pix2struct/processing_pix2struct.py | 4 - .../models/pixtral/processing_pixtral.py | 4 - .../models/pop2piano/processing_pop2piano.py | 4 - .../qwen2_5_omni/processing_qwen2_5_omni.py | 6 - .../qwen2_5_vl/processing_qwen2_5_vl.py | 6 - .../qwen2_audio/processing_qwen2_audio.py | 4 - .../models/qwen2_vl/processing_qwen2_vl.py | 5 - .../processing_qwen3_omni_moe.py | 6 - 
.../models/qwen3_vl/processing_qwen3_vl.py | 5 - src/transformers/models/sam/processing_sam.py | 3 - .../models/sam2/processing_sam2.py | 3 - .../sam2_video/processing_sam2_video.py | 4 - .../models/sam_hq/processing_samhq.py | 3 - .../seamless_m4t/processing_seamless_m4t.py | 2 - .../models/siglip/processing_siglip.py | 4 - .../models/siglip2/processing_siglip2.py | 4 - .../models/smolvlm/processing_smolvlm.py | 5 - .../processing_speech_to_text.py | 3 - .../models/speecht5/processing_speecht5.py | 3 - .../models/trocr/processing_trocr.py | 4 - src/transformers/models/tvp/processing_tvp.py | 4 - .../models/udop/processing_udop.py | 4 - .../video_llama_3/processing_video_llama_3.py | 5 - .../video_llava/processing_video_llava.py | 5 - .../models/vilt/processing_vilt.py | 3 - .../processing_vision_text_dual_encoder.py | 4 - .../models/voxtral/processing_voxtral.py | 4 - .../models/wav2vec2/processing_wav2vec2.py | 3 - .../wav2vec2_bert/processing_wav2vec2_bert.py | 3 - .../processing_wav2vec2_with_lm.py | 3 - .../models/whisper/processing_whisper.py | 3 - .../models/x_clip/processing_x_clip.py | 8 +- src/transformers/processing_utils.py | 157 +++++++++++------- .../wav2vec2/test_processing_wav2vec2.py | 7 +- .../test_processing_wav2vec2_bert.py | 7 +- .../test_processing_wav2vec2_with_lm.py | 16 +- tests/test_processing_common.py | 122 +++++--------- 111 files changed, 213 insertions(+), 535 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index fbca27b2ff39..ac927b8d2306 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -59,9 +59,6 @@ class AlignProcessor(ProcessorMixin): """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "EfficientNetImageProcessor" - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = AlignProcessorKwargs def __init__(self, image_processor, 
tokenizer): diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 24631ecacbd7..933a5e48dfed 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -35,10 +35,6 @@ class AltCLIPProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") - tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast") - @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor") def __init__(self, image_processor=None, tokenizer=None): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index d0841c96aee2..c29c289649da 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -67,10 +67,6 @@ class AriaProcessor(ProcessorMixin): A dictionary indicating size conversions for images. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AriaImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 38f09a5a3ee8..0f0b6660fc36 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -42,6 +42,7 @@ ("audio-spectrogram-transformer", "ASTFeatureExtractor"), ("clap", "ClapFeatureExtractor"), ("clvp", "ClvpFeatureExtractor"), + ("csm", "EncodecFeatureExtractor"), ("dac", "DacFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("dia", "DiaFeatureExtractor"), @@ -50,14 +51,21 @@ ("granite_speech", "GraniteSpeechFeatureExtractor"), ("hubert", "Wav2Vec2FeatureExtractor"), ("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"), + ("markuplm", "MarkupLMFeatureExtractor"), ("mctct", "MCTCTFeatureExtractor"), ("mimi", "EncodecFeatureExtractor"), ("moonshine", "Wav2Vec2FeatureExtractor"), ("moshi", "EncodecFeatureExtractor"), + ("musicgen", "EncodecFeatureExtractor"), + ("musicgen_melody", "MusicgenMelodyFeatureExtractor"), + ("parakeet", "ParakeetFeatureExtractor"), ("parakeet_ctc", "ParakeetFeatureExtractor"), ("parakeet_encoder", "ParakeetFeatureExtractor"), ("phi4_multimodal", "Phi4MultimodalFeatureExtractor"), ("pop2piano", "Pop2PianoFeatureExtractor"), + ("qwen2_5_omni", "WhisperFeatureExtractor"), + ("qwen2_audio", "WhisperFeatureExtractor"), + ("qwen3_omni_moe", "WhisperFeatureExtractor"), ("seamless_m4t", "SeamlessM4TFeatureExtractor"), ("seamless_m4t_v2", "SeamlessM4TFeatureExtractor"), ("sew", "Wav2Vec2FeatureExtractor"), @@ -67,6 +75,7 @@ ("unispeech", "Wav2Vec2FeatureExtractor"), ("unispeech-sat", "Wav2Vec2FeatureExtractor"), ("univnet", "UnivNetFeatureExtractor"), + ("voxtral", "WhisperFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), 
("wav2vec2-bert", "Wav2Vec2FeatureExtractor"), ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index f864d107914f..4aeae5571c52 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -62,7 +62,9 @@ ("aimv2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("aimv2_vision_model", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), + ("altclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("aria", ("AriaImageProcessor", None)), + ("aya_vision", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")), ("beit", ("BeitImageProcessor", "BeitImageProcessorFast")), ("bit", ("BitImageProcessor", "BitImageProcessorFast")), ("blip", ("BlipImageProcessor", "BlipImageProcessorFast")), @@ -73,6 +75,8 @@ ("clip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")), ("cohere2_vision", (None, "Cohere2VisionImageProcessorFast")), + ("colpali", ("SiglipImageProcessor", "SiglipImageProcessorFast")), + ("colqwen2", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("conditional_detr", ("ConditionalDetrImageProcessor", "ConditionalDetrImageProcessorFast")), ("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), @@ -94,10 +98,13 @@ ("edgetam", (None, "Sam2ImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor", None)), ("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")), + ("emu3", ("Emu3ImageProcessor", None)), ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), ("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")), ("flava", ("FlavaImageProcessor", 
"FlavaImageProcessorFast")), - ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")), + ("florence2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")), ("fuyu", ("FuyuImageProcessor", None)), ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")), ("gemma3n", ("SiglipImageProcessor", "SiglipImageProcessorFast")), @@ -115,11 +122,13 @@ ("imagegpt", ("ImageGPTImageProcessor", "ImageGPTImageProcessorFast")), ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")), ("instructblipvideo", ("InstructBlipVideoImageProcessor", None)), + ("internvl", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")), ("janus", ("JanusImageProcessor", "JanusImageProcessorFast")), ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("kosmos-2.5", ("Kosmos2_5ImageProcessor", "Kosmos2_5ImageProcessorFast")), ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), + ("layoutxlm", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")), ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")), ("lightglue", ("LightGlueImageProcessor", None)), @@ -142,6 +151,7 @@ ("mobilevitv2", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")), ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")), + ("omdet-turbo", ("DetrImageProcessor", "DetrImageProcessorFast")), ("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")), ("ovis2", ("Ovis2ImageProcessor", "Ovis2ImageProcessorFast")), ("owlv2", ("Owlv2ImageProcessor", "Owlv2ImageProcessorFast")), @@ -156,14 +166,17 @@ ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")), ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")), ("pvt_v2",
("PvtImageProcessor", "PvtImageProcessorFast")), + ("qwen2_5_omni", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), + ("qwen3_omni_moe", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen3_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")), ("sam", ("SamImageProcessor", "SamImageProcessorFast")), ("sam2", (None, "Sam2ImageProcessorFast")), + ("sam2_video", (None, "Sam2ImageProcessorFast")), ("sam_hq", ("SamImageProcessor", "SamImageProcessorFast")), ("segformer", ("SegformerImageProcessor", "SegformerImageProcessorFast")), ("seggpt", ("SegGptImageProcessor", None)), @@ -181,12 +194,14 @@ ("textnet", ("TextNetImageProcessor", "TextNetImageProcessorFast")), ("timesformer", ("VideoMAEImageProcessor", None)), ("timm_wrapper", ("TimmWrapperImageProcessor", None)), + ("trocr", ("ViTImageProcessor", "ViTImageProcessorFast")), ("tvlt", ("TvltImageProcessor", None)), ("tvp", ("TvpImageProcessor", "TvpImageProcessorFast")), ("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), ("upernet", ("SegformerImageProcessor", "SegformerImageProcessorFast")), ("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("video_llama_3", ("VideoLlama3ImageProcessor", "VideoLlama3ImageProcessorFast")), + ("video_llava", ("VideoLlavaImageProcessor", None)), ("videomae", ("VideoMAEImageProcessor", None)), ("vilt", ("ViltImageProcessor", "ViltImageProcessorFast")), ("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index cb2eb94cecd4..3c2230722ca9 100644 --- 
a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -108,6 +108,7 @@ ("mllama", "MllamaProcessor"), ("mm-grounding-dino", "GroundingDinoProcessor"), ("moonshine", "Wav2Vec2Processor"), + ("omdet-turbo", "OmDetTurboProcessor"), ("oneformer", "OneFormerProcessor"), ("ovis2", "Ovis2Processor"), ("owlv2", "Owlv2Processor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index ccee9937afa6..ed943b231ead 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -73,6 +73,7 @@ ), ), ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("altclip", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), ("arcee", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), @@ -157,6 +158,7 @@ ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), ("cohere2", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), + ("cohere2_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), ("colpali", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("colqwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), @@ -225,6 +227,7 @@ ), ), ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), + ("donut", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if 
is_tokenizers_available() else None)), ( "dpr", ( @@ -239,6 +242,7 @@ ("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)), ("esm", ("EsmTokenizer", None)), + ("evolla", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "exaone4", ( @@ -253,10 +257,13 @@ ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None), ), ("flaubert", ("FlaubertTokenizer", None)), + ("flava", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("flex_olmo", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("florence2", ("BartTokenizer", "BartTokenizerFast" if is_tokenizers_available() else None)), ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), + ("fuyu", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "gemma", ( @@ -305,6 +312,7 @@ ("glm4_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("got_ocr2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), @@ -315,6 +323,7 @@ ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)), ("granite", ("GPT2Tokenizer", None)), + ("granite_speech", ("GPT2Tokenizer", None)),
("granitemoe", ("GPT2Tokenizer", None)), ("granitemoehybrid", ("GPT2Tokenizer", None)), ("granitemoeshared", ("GPT2Tokenizer", None)), @@ -354,11 +363,14 @@ ), ), ("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("kyutai_speech_to_text", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)), ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), + ("lfm2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("lfm2_vl", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), ( "llama", @@ -399,6 +411,7 @@ ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), + ("markuplm", ("MarkupLMTokenizer", "MarkupLMTokenizerFast" if is_tokenizers_available() else None)), ( "mbart", ( @@ -485,6 +498,7 @@ "NllbTokenizerFast" if is_tokenizers_available() else None, ), ), + ("nougat", (None, "NougatTokenizerFast" if is_tokenizers_available() else None)), ( "nystromformer", ( @@ -506,6 +520,7 @@ ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None), ), ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("ovis2", (None, "Qwen2TokenizerFast" if is_tokenizers_available() else
None)), ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), @@ -531,6 +546,7 @@ None, ), ), + ("perception_lm", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "persimmon", ( @@ -540,6 +556,7 @@ ), ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("phi4_multimodal", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("phobert", ("PhobertTokenizer", None)), ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), @@ -553,6 +570,7 @@ ), ), ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), + ("pop2piano", ("Pop2PianoTokenizer", None)), ("prophetnet", ("ProphetNetTokenizer", None)), ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ( @@ -659,6 +677,7 @@ ), ), ("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("smolvlm", ("PreTrainedTokenizer", "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)), @@ -693,6 +712,7 @@ ("tapas", ("TapasTokenizer", None)), ("tapex", ("TapexTokenizer", None)), ("transfo-xl", ("TransfoXLTokenizer", None)), + ("trocr", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), ("tvp", ("BertTokenizer", "BertTokenizerFast" if 
is_tokenizers_available() else None)), ( "udop", @@ -708,9 +728,14 @@ "T5TokenizerFast" if is_tokenizers_available() else None, ), ), + ("video_llama_3", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("video_llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ( + "vision_text_dual_encoder", + ("PreTrainedTokenizer", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ), ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("vits", ("VitsTokenizer", None)), ( @@ -726,6 +751,7 @@ ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), + ("wav2vec2_with_lm", ("Wav2Vec2CTCTokenizer", None)), ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)), ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ( diff --git a/src/transformers/models/auto/video_processing_auto.py b/src/transformers/models/auto/video_processing_auto.py index 78956269331f..c660b2e7a1ff 100644 --- a/src/transformers/models/auto/video_processing_auto.py +++ b/src/transformers/models/auto/video_processing_auto.py @@ -64,6 +64,7 @@ ("video_llava", "VideoLlavaVideoProcessor"), ("videomae", "VideoMAEVideoProcessor"), ("vjepa2", "VJEPA2VideoProcessor"), + ("video_llama_3", "VideoLlama3VideoProcessor"), # PLACEHOLDER - needs proper video processor class ] ) diff --git a/src/transformers/models/aya_vision/processing_aya_vision.py b/src/transformers/models/aya_vision/processing_aya_vision.py index 882a85d40946..049b0e5d24eb 100644 --- a/src/transformers/models/aya_vision/processing_aya_vision.py +++ 
b/src/transformers/models/aya_vision/processing_aya_vision.py @@ -70,10 +70,6 @@ class AyaVisionProcessor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 0602986483a6..65a1700837d9 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -49,7 +49,6 @@ class BarkProcessor(ProcessorMixin): """ - tokenizer_class = "AutoTokenizer" attributes = ["tokenizer"] preset_shape = { diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index f600e8ce27d8..965164206c5a 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -53,10 +53,6 @@ class BlipProcessor(ProcessorMixin): An instance of ['BertTokenizerFast`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor, tokenizer, **kwargs): tokenizer.return_token_type_ids = False super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 40729f4f4501..5949e2c648ce 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -60,10 +60,6 @@ class Blip2Processor(ProcessorMixin): Number of tokens used by the Qformer as queries, should be same as in model's config. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): tokenizer.return_token_type_ids = False if not hasattr(tokenizer, "image_token"): diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 030c578c49cd..5de97ec411dc 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -54,9 +54,6 @@ class BridgeTowerProcessor(ProcessorMixin): An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "BridgeTowerImageProcessor" - tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") valid_processor_kwargs = BridgeTowerProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/bros/processing_bros.py b/src/transformers/models/bros/processing_bros.py index 8de0a1c49b0d..fe58e17b12b6 100644 --- a/src/transformers/models/bros/processing_bros.py +++ b/src/transformers/models/bros/processing_bros.py @@ -47,7 +47,6 @@ class BrosProcessor(ProcessorMixin): """ attributes = ["tokenizer"] - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = BrosProcessorKwargs def __init__(self, tokenizer=None, **kwargs): diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 247f72322a2d..694be7ab8f26 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -69,10 +69,6 @@ class ChameleonProcessor(ProcessorMixin): The special token used to indicate image in the text. 
""" - attributes = ["image_processor", "tokenizer"] - tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") - image_processor_class = "ChameleonImageProcessor" - def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): self.image_seq_length = image_seq_length self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 0510b9b0f3c9..6508136f772e 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -34,10 +34,6 @@ class ChineseCLIPProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast") - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 6524a8715841..a72151cb9b63 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -42,9 +42,6 @@ class ClapProcessor(ProcessorMixin): The tokenizer is a required input. 
""" - feature_extractor_class = "ClapFeatureExtractor" - tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") - def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 7b856f9981ee..9258d2e8fee3 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -33,10 +33,6 @@ class CLIPProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 39e091106c71..4d431181cb4f 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -34,10 +34,6 @@ class CLIPSegProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast") - tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clvp/processing_clvp.py b/src/transformers/models/clvp/processing_clvp.py index 8fad43cd2f30..331589a23999 100644 --- a/src/transformers/models/clvp/processing_clvp.py +++ b/src/transformers/models/clvp/processing_clvp.py @@ -38,9 +38,6 @@ class ClvpProcessor(ProcessorMixin): An instance of [`ClvpTokenizer`]. The tokenizer is a required input. 
""" - feature_extractor_class = "ClvpFeatureExtractor" - tokenizer_class = "ClvpTokenizer" - def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) diff --git a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py index d4fcec4da875..b34fd1c5594e 100644 --- a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -47,10 +47,6 @@ class Cohere2VisionProcessor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index cd33607a35fd..1ad511ced7a7 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -93,10 +93,6 @@ class ColPaliProcessor(ProcessorMixin): A prefix to be used for the query. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast") - tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index 2eb9fed873a8..00f00c920856 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -64,11 +64,6 @@ class ColQwen2Processor(ProcessorMixin): query_prefix (`str`, *optional*): A prefix to be used for the query. 
""" - attributes = ["image_processor", "tokenizer"] - - image_processor_class = "AutoImageProcessor" - tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index 172016f6431d..d77ffeffd896 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -95,10 +95,6 @@ class CsmProcessor(ProcessorMixin): """ - attributes = ["feature_extractor", "tokenizer"] - feature_extractor_class = "EncodecFeatureExtractor" - tokenizer_class = "PreTrainedTokenizerFast" - def __init__( self, feature_extractor, diff --git a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py index ddeb4f799ee1..0ebf46ac5ad0 100644 --- a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py @@ -52,10 +52,7 @@ class DeepseekVLProcessor(ProcessorMixin): The number of special image tokens used as placeholders for visual content in text sequences. """ - attributes = ["image_processor", "tokenizer"] valid_kwargs = ["chat_template", "num_image_tokens"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" def __init__( self, diff --git a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py index d20fa495f9b8..9049495932f6 100644 --- a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py @@ -52,10 +52,7 @@ class DeepseekVLHybridProcessor(ProcessorMixin): The number of special image tokens used as placeholders for visual content in text sequences. 
""" - attributes = ["image_processor", "tokenizer"] valid_kwargs = ["chat_template", "num_image_tokens"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" def __init__( self, diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index 6518b5444639..23c04687308c 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -77,8 +77,6 @@ class DiaProcessor(ProcessorMixin): An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input. """ - feature_extractor_class = "DiaFeatureExtractor" - tokenizer_class = "DiaTokenizer" audio_tokenizer_class = "DacModel" def __init__(self, feature_extractor, tokenizer, audio_tokenizer): diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 65ca58bcf781..fedd173117eb 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -48,10 +48,6 @@ class DonutProcessor(ProcessorMixin): An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index b7ed8e9074f0..52f39a913c54 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -64,10 +64,6 @@ class Emu3Processor(ProcessorMixin): in a chat into a tokenizable string. 
""" - attributes = ["image_processor", "tokenizer"] - tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast") - image_processor_class = "Emu3ImageProcessor" - def __init__( self, image_processor, diff --git a/src/transformers/models/evolla/processing_evolla.py b/src/transformers/models/evolla/processing_evolla.py index 3be0e07364a6..464ded903105 100644 --- a/src/transformers/models/evolla/processing_evolla.py +++ b/src/transformers/models/evolla/processing_evolla.py @@ -52,7 +52,6 @@ class EvollaProcessor(ProcessorMixin): # protein_tokenizer_class = "EsmTokenizer" # tokenizer_class = "LlamaTokenizerFast" protein_tokenizer_class = "AutoTokenizer" - tokenizer_class = "AutoTokenizer" protein_tokenizer_dir_name = "protein_tokenizer" # tokenizer_dir_name = "text_tokenizer" diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 272fb01d7b7a..7e5b3c0e012e 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -31,10 +31,6 @@ class FlavaProcessor(ProcessorMixin): tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "FlavaImageProcessor" - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/florence2/processing_florence2.py b/src/transformers/models/florence2/processing_florence2.py index 1c25ddceeafc..c8d699e4bc3e 100644 --- a/src/transformers/models/florence2/processing_florence2.py +++ b/src/transformers/models/florence2/processing_florence2.py @@ -62,10 +62,6 @@ class Florence2Processor(ProcessorMixin): thresholds, or banned tokens. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = ("BartTokenizer", "BartTokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index a715ce412313..ee697deccf9e 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -347,10 +347,6 @@ class FuyuProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "FuyuImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor=image_processor, tokenizer=tokenizer) self.image_processor = image_processor diff --git a/src/transformers/models/gemma3/processing_gemma3.py b/src/transformers/models/gemma3/processing_gemma3.py index a9bac5b69e47..11574e30b7c1 100644 --- a/src/transformers/models/gemma3/processing_gemma3.py +++ b/src/transformers/models/gemma3/processing_gemma3.py @@ -42,10 +42,6 @@ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): class Gemma3Processor(ProcessorMixin): - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, diff --git a/src/transformers/models/gemma3n/processing_gemma3n.py b/src/transformers/models/gemma3n/processing_gemma3n.py index 913336b8d3f5..51b686557ed0 100644 --- a/src/transformers/models/gemma3n/processing_gemma3n.py +++ b/src/transformers/models/gemma3n/processing_gemma3n.py @@ -51,11 +51,6 @@ class Gemma3nProcessor(ProcessorMixin): The number of image soft tokens that should be added to """ - attributes = ["feature_extractor", "image_processor", "tokenizer"] - feature_extractor_class = "AutoFeatureExtractor" - image_processor_class = "AutoImageProcessor" - 
tokenizer_class = "AutoTokenizer" - def __init__( self, feature_extractor, diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 2eba7c68f584..89cfc9618987 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -33,10 +33,6 @@ class GitProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index e8f9c948c66d..79935cbde7b4 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -59,12 +59,6 @@ class Glm4vProcessor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer", "video_processor"] - image_processor_class = "AutoImageProcessor" - video_processor_class = "AutoVideoProcessor" - - tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 1843b7f28830..162efef5e9f9 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -93,10 +93,6 @@ class GotOcr2Processor(ProcessorMixin): in a chat into a tokenizable string. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "PreTrainedTokenizerFast" - def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 0b76ccfe75db..b3d3a8719185 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -32,7 +32,6 @@ class GraniteSpeechProcessor(ProcessorMixin): attributes = ["audio_processor", "tokenizer"] audio_processor_class = "GraniteSpeechFeatureExtractor" - tokenizer_class = "AutoTokenizer" def __init__( self, diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 5f2f900451b2..74565588d852 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -130,9 +130,6 @@ class GroundingDinoProcessor(ProcessorMixin): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "GroundingDinoImageProcessor" - tokenizer_class = "AutoTokenizer" valid_processor_kwargs = GroundingDinoProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index b0ad20df386b..7cb640e56854 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -153,10 +153,6 @@ class IdeficsProcessor(ProcessorMixin): The string representation of token representing end of utterance """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "IdeficsImageProcessor" - tokenizer_class = "LlamaTokenizerFast" - def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs): super().__init__(image_processor, tokenizer) self.image_token_id = ( diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index c419a3641254..df5f9ca73a8b 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -75,10 +75,6 @@ class Idefics2Processor(ProcessorMixin): in a chat into a tokenizable string. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics2ImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs ): diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 451af1d8a38f..373e3e3ed9f3 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -121,10 +121,6 @@ class Idefics3Processor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics3ImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs ): diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index afe43c1fc7a7..2ae4a0a8a229 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -65,9 +65,6 @@ class InstructBlipProcessor(ProcessorMixin): Number of tokens used by the Qformer as queries, should be same as in model's config. 
""" - attributes = ["image_processor", "tokenizer", "qformer_tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") - tokenizer_class = "AutoTokenizer" qformer_tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index ee4e843e2f33..f609b3e1be0c 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -55,11 +55,6 @@ class InstructBlipVideoProcessor(ProcessorMixin): Number of tokens used by the Qformer as queries, should be same as in model's config. """ - attributes = ["video_processor", "tokenizer", "qformer_tokenizer"] - video_processor_class = "AutoVideoProcessor" - tokenizer_class = "AutoTokenizer" - qformer_tokenizer_class = "AutoTokenizer" - def __init__(self, video_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): if not hasattr(tokenizer, "video_token"): self.video_token = AddedToken("