diff --git a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
index 7153154048b6..c0ae21ecd84e 100644
--- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
@@ -232,9 +232,10 @@ def _preprocess(
             patches = stacked_videos
 
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, self.temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size
diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index 8a70a1a68584..90d7dd0abfb9 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -40,8 +40,6 @@ def smart_resize(
     min_pixels: int = 128 * 128,
     max_pixels: int = 16 * 16 * 2 * 2 * 2 * 6144,
 ):
-    if num_frames < temporal_factor:
-        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
     if height < factor or width < factor:
         raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
     elif max(height, width) / min(height, width) > 200:
@@ -50,7 +48,7 @@ def smart_resize(
         )
     h_bar = round(height / factor) * factor
     w_bar = round(width / factor) * factor
-    t_bar = round(num_frames / temporal_factor) * temporal_factor
+    t_bar = math.ceil(num_frames / temporal_factor) * temporal_factor
 
     if t_bar * h_bar * w_bar > max_pixels:
         beta = math.sqrt((num_frames * height * width) / max_pixels)
@@ -232,9 +230,10 @@ def _preprocess(
             patches = stacked_videos
 
             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size
             grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
diff --git a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
index b80adebbd9ab..0ccffca73fa7 100644
--- a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
@@ -343,3 +343,22 @@ def test_call_sample_frames(self):
 
         # Assign back the actual num frames in tester
         self.video_processor_tester.num_frames = prev_num_frames
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 5 * 28 * 28, "shortest_edge": 28 * 28}
+            video_processor_dict["do_sample_frames"] = False
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 28, 28
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 14 * 14])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])
diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
index 60f4023938bb..d3b9423030c2 100644
--- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -328,3 +328,22 @@ def test_call_sample_frames(self):
             self.video_processor_tester.min_resolution = prev_min_resolution
         if prev_max_resolution is not None:
             self.video_processor_tester.max_resolution = prev_max_resolution
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 5 * 32 * 32, "shortest_edge": 32 * 32}
+            video_processor_dict["do_sample_frames"] = False
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 32, 32
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 16 * 16])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])
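
Not part of the patch: a minimal standalone sketch (plain PyTorch, hypothetical shapes) of the padding arithmetic the hunks above switch to, showing that `-T % temporal_patch_size` yields exactly the number of trailing pad frames needed (0 when `T` is already a multiple) and that `expand` gives a broadcast view of the last frame where the old `repeat` call materialized copies.

import torch

temporal_patch_size = 3                 # hypothetical value, matching the new tests
patches = torch.randn(1, 5, 3, 28, 28)  # dummy (batch, T, C, H, W) video batch

T = patches.shape[1]
pad = -T % temporal_patch_size          # -5 % 3 == 1; 0 when T is already a multiple
if pad:
    # expand() returns a view over the last frame rather than copying it as repeat() did
    repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
    patches = torch.cat((patches, repeats), dim=1)

assert patches.shape[1] % temporal_patch_size == 0  # 5 frames padded to 6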