Qwen2-VL position ids: compute them with get_rope_index in forward() instead of tiling them in prepare_inputs()

huggingface · Jan 6, 2025 · 0307ec9
1 parent 35c47a2
commit 0307ec9
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 10 deletions.
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
@@ -421,9 +421,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c
                 offset = 0
             mask_shape = attention_mask.shape
             mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
-            causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = (
-                mask_slice
-            )
+            causal_mask[
+                : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
+            ] = mask_slice
 
     if (
         self.config._attn_implementation == "sdpa"
@@ -2058,9 +2058,9 @@ def _dbrx_update_causal_mask_legacy(
                 offset = 0
             mask_shape = attention_mask.shape
             mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
-            causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = (
-                mask_slice
-            )
+            causal_mask[
+                : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
+            ] = mask_slice
 
     if (
         self.config._attn_implementation == "sdpa"

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
@@ -53,7 +53,7 @@
 
 
 if TYPE_CHECKING:
-    from PIL import Image
+    from PIL.Image import Image
 
 
 logger = logging.getLogger(__name__)
@@ -166,9 +166,6 @@ def prepare_inputs(
             if past_len:
                 position_ids = position_ids[:, -inputs_embeds.shape[1] :]
 
-            if self.config.model_type == "qwen2_vl" and position_ids.ndim != 3:
-                position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0)
-
             inputs["position_ids"] = position_ids
 
         if "beam_idx" in self.input_names:
@@ -2228,6 +2225,9 @@ def forward(
         rope_deltas=None,
         **kwargs,
     ):
+        if position_ids is None and input_ids is not None:
+            position_ids, _ = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask)
+
         result = super().forward(
             input_ids,
             pixel_values,