From 544e4ed73f746a8248f4325ac79078e66645afb2 Mon Sep 17 00:00:00 2001 From: "Qile.Xu" Date: Sat, 8 Nov 2025 10:54:27 +0000 Subject: [PATCH] fix return_metadata_checking_logic --- src/transformers/models/glm4v/modular_glm4v.py | 2 +- src/transformers/models/glm4v/processing_glm4v.py | 2 +- src/transformers/models/qwen3_vl/modular_qwen3_vl.py | 2 +- src/transformers/models/qwen3_vl/processing_qwen3_vl.py | 2 +- src/transformers/models/smolvlm/processing_smolvlm.py | 2 +- src/transformers/models/video_llama_3/modular_video_llama_3.py | 2 +- .../models/video_llama_3/processing_video_llama_3.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 8ae513b63d44..17d41582b73c 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -1607,7 +1607,7 @@ def __call__( if videos is not None: videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) # If user has not requested video metadata, pop it - if "return_metadata" not in kwargs: + if not kwargs.get("return_metadata"): video_metadata = videos_inputs.pop("video_metadata") else: video_metadata = videos_inputs["video_metadata"] diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index e8f9c948c66d..2823760c3137 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -135,7 +135,7 @@ def __call__( if videos is not None: videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) # If user has not requested video metadata, pop it - if "return_metadata" not in kwargs: + if not kwargs.get("return_metadata"): video_metadata = videos_inputs.pop("video_metadata") else: video_metadata = videos_inputs["video_metadata"] diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 7758a23e2970..77eadc012b13 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -1346,7 +1346,7 @@ def __call__( videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) video_grid_thw = videos_inputs["video_grid_thw"] # If user has not requested video metadata, pop it - if "return_metadata" not in kwargs: + if not kwargs.get("return_metadata"): video_metadata = videos_inputs.pop("video_metadata") else: video_metadata = videos_inputs["video_metadata"] diff --git a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py index 85e7471a938d..5531807e5313 100644 --- a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py @@ -153,7 +153,7 @@ def __call__( videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) video_grid_thw = videos_inputs["video_grid_thw"] # If user has not requested video metadata, pop it - if "return_metadata" not in kwargs: + if not kwargs.get("return_metadata"): video_metadata = videos_inputs.pop("video_metadata") else: video_metadata = videos_inputs["video_metadata"] diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index 86d07e238f1b..74cc128eadad 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -345,7 +345,7 @@ def __call__( # If user has not requested video metadata, pop it. By default metadata # is always returned to expand video tokens correctly - if "return_metadata" not in kwargs: + if not kwargs.get("return_metadata"): vision_inputs.pop("video_metadata") inputs.update(vision_inputs) diff --git a/src/transformers/models/video_llama_3/modular_video_llama_3.py b/src/transformers/models/video_llama_3/modular_video_llama_3.py index 789248676ab7..cfcaf0dec655 100644 --- a/src/transformers/models/video_llama_3/modular_video_llama_3.py +++ b/src/transformers/models/video_llama_3/modular_video_llama_3.py @@ -1134,7 +1134,7 @@ def __call__( for grid_thw, merge_size in zip(videos_inputs["video_grid_thw"], videos_inputs["video_merge_sizes"]) ] video_compression_masks = videos_inputs["video_compression_mask"].split(num_video_tokens) - if "return_metadata" not in kwargs: + if not kwargs.get("return_metadata"): video_metadata = videos_inputs.pop("video_metadata") else: video_metadata = videos_inputs["video_metadata"] diff --git a/src/transformers/models/video_llama_3/processing_video_llama_3.py b/src/transformers/models/video_llama_3/processing_video_llama_3.py index 37127d736053..9a861bb9a5e6 100644 --- a/src/transformers/models/video_llama_3/processing_video_llama_3.py +++ b/src/transformers/models/video_llama_3/processing_video_llama_3.py @@ -139,7 +139,7 @@ def __call__( for grid_thw, merge_size in zip(videos_inputs["video_grid_thw"], videos_inputs["video_merge_sizes"]) ] video_compression_masks = videos_inputs["video_compression_mask"].split(num_video_tokens) - if "return_metadata" not in kwargs: + if not kwargs.get("return_metadata"): video_metadata = videos_inputs.pop("video_metadata") else: video_metadata = videos_inputs["video_metadata"]