7 changes: 4 additions & 3 deletions src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
@@ -232,9 +232,10 @@ def _preprocess(
             patches = stacked_videos

             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, self.temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)

             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size
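The key change in both processors is how many trailing frames get appended. The old branch always appended `temporal_patch_size - 1` copies of the last frame, which only lands on a multiple of `temporal_patch_size` when exactly one frame is missing; `-T % temporal_patch_size` computes the exact shortfall, and `expand` gives a broadcast view of the last frame instead of the copy that `repeat` makes. A minimal standalone sketch of that arithmetic (illustrative shapes only, not part of the diff):

import torch

temporal_patch_size = 3
patches = torch.randn(1, 5, 3, 28, 28)  # (batch, T=5 frames, channels, height, width)

T = patches.shape[1]
pad = -T % temporal_patch_size  # frames missing to reach the next multiple: -5 % 3 == 1
if pad:
    # expand() returns a broadcast view of the last frame (no copy);
    # torch.cat then materializes the padded tensor in a single allocation.
    repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
    patches = torch.cat((patches, repeats), dim=1)

print(patches.shape)  # torch.Size([1, 6, 3, 28, 28]) -- now divisible by temporal_patch_size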
11 changes: 5 additions & 6 deletions src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -40,8 +40,6 @@ def smart_resize(
     min_pixels: int = 128 * 128,
     max_pixels: int = 16 * 16 * 2 * 2 * 2 * 6144,
 ):
-    if num_frames < temporal_factor:
-        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
     if height < factor or width < factor:
         raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
     elif max(height, width) / min(height, width) > 200:
@@ -50,7 +48,7 @@
         )
     h_bar = round(height / factor) * factor
     w_bar = round(width / factor) * factor
-    t_bar = round(num_frames / temporal_factor) * temporal_factor
+    t_bar = math.ceil(num_frames / temporal_factor) * temporal_factor

     if t_bar * h_bar * w_bar > max_pixels:
         beta = math.sqrt((num_frames * height * width) / max_pixels)
@@ -232,9 +230,10 @@ def _preprocess(
             patches = stacked_videos

             # Check that videos have `num_frames` divisible by `temporal_patch_size`
-            if patches.shape[1] % temporal_patch_size != 0:
-                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
-                patches = torch.cat([patches, repeats], dim=1)
+            T = patches.shape[1]
+            if pad := -T % temporal_patch_size:
+                repeats = patches[:, -1:].expand(-1, pad, -1, -1, -1)
+                patches = torch.cat((patches, repeats), dim=1)
             batch_size, grid_t, channel = patches.shape[:3]
             grid_t = grid_t // temporal_patch_size
             grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
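The `smart_resize` change swaps `round` for `math.ceil` when computing `t_bar`. With `round`, a frame count at or just below the halfway point can produce a `t_bar` smaller than `num_frames` (or even 0 for a single frame) because Python rounds .5 to the nearest even value; `math.ceil` always rounds up to the next multiple, which is also why the `num_frames < temporal_factor` guard could be dropped. A small illustration of the difference (hypothetical `temporal_factor` value, not part of the diff):

import math

temporal_factor = 2
for num_frames in (1, 3, 5):
    rounded = round(num_frames / temporal_factor) * temporal_factor
    ceiled = math.ceil(num_frames / temporal_factor) * temporal_factor
    print(num_frames, rounded, ceiled)
# 1 -> rounded 0 (degenerate grid), ceiled 2
# 3 -> rounded 4, ceiled 4
# 5 -> rounded 4 (round(2.5) == 2, fewer temporal slots than frames), ceiled 6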
19 changes: 19 additions & 0 deletions tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
@@ -343,3 +343,22 @@ def test_call_sample_frames(self):

         # Assign back the actual num frames in tester
         self.video_processor_tester.num_frames = prev_num_frames
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 5 * 28 * 28, "shortest_edge": 28 * 28}
+            video_processor_dict["do_sample_frames"] = False
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 28, 28
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 14 * 14])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])
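For reference, the asserted shapes follow from the padding and grid arithmetic above. This is a worked sketch under the assumption of a spatial patch size of 14 (implied by the 14 * 14 factor in the assertion) and 3 RGB channels:

temporal_patch_size, patch_size, channels = 3, 14, 3
num_frames, height, width = 5, 28, 28

padded_frames = num_frames + (-num_frames % temporal_patch_size)      # 5 -> 6
grid_t = padded_frames // temporal_patch_size                         # 2
grid_h, grid_w = height // patch_size, width // patch_size            # 2, 2

num_patches = grid_t * grid_h * grid_w                                 # 8
patch_dim = temporal_patch_size * channels * patch_size * patch_size   # 3 * 3 * 14 * 14 = 1764
print([num_patches, patch_dim])   # [8, 1764] -> encoded_videos.shape
print([grid_t, grid_h, grid_w])   # [2, 2, 2]  -> video_grid_thw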
19 changes: 19 additions & 0 deletions tests/models/qwen3_vl/test_video_processing_qwen3_vl.py
@@ -328,3 +328,22 @@ def test_call_sample_frames(self):
         self.video_processor_tester.min_resolution = prev_min_resolution
         if prev_max_resolution is not None:
             self.video_processor_tester.max_resolution = prev_max_resolution
+
+    def test_num_frames_equal_temporal_patch_size_plus_two(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_dict = self.video_processor_dict.copy()
+            video_processor_dict["size"] = {"longest_edge": 5 * 32 * 32, "shortest_edge": 32 * 32}
+            video_processor_dict["do_sample_frames"] = False
+            temporal_patch_size = 3
+            video_processor_dict["temporal_patch_size"] = temporal_patch_size
+            video_processing = video_processing_class(**video_processor_dict)
+
+            n, w, h = 5, 32, 32
+            video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
+
+            video_processed = video_processing(video_inputs, return_tensors="pt")
+            encoded_videos = video_processed[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 16 * 16])
+
+            video_grid_thw = video_processed["video_grid_thw"]
+            self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])