From c71c7e7febb8a274f5090252d2469cd13530c9c1 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 20 Dec 2024 11:34:25 +0100 Subject: [PATCH 1/8] test 4.47 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d9b3b8642..cc0e8cb4b 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", "optimum~=1.23", - "transformers>=4.36,<4.47", + "transformers>=4.36,<4.48", "datasets>=1.4.0", "sentencepiece", "setuptools", From 3e2cf34e6a4bd8bee2d101cd2be522d644b894e3 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 20 Dec 2024 12:13:28 +0100 Subject: [PATCH 2/8] update optimum --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cc0e8cb4b..0f02ef15c 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "optimum~=1.23", + "optimum@git+https://github.com/huggingface/optimum.git", "transformers>=4.36,<4.48", "datasets>=1.4.0", "sentencepiece", From 35f6fb6b75692ad21eac899548ed8e35febe32dc Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 20 Dec 2024 13:12:51 +0100 Subject: [PATCH 3/8] patch gemma attn functions --- optimum/exporters/openvino/model_patcher.py | 30 +++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 825eaac48..fb5bb7ac9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -421,9 +421,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -2058,9 +2058,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -2712,7 +2712,14 @@ def patched_forward(*args, **kwargs): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.45.0"): + + if is_transformers_version(">=", "4.47.0"): + from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_FUNCTION + + GEMMA2_ATTENTION_FUNCTION["original_eager"] = GEMMA2_ATTENTION_FUNCTION["eager"] + GEMMA2_ATTENTION_FUNCTION["eager"] = GEMMA2_ATTENTION_FUNCTION["sdpa"] + + elif is_transformers_version(">=", "4.45.0"): from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_CLASSES sdpa_attn = GEMMA2_ATTENTION_CLASSES["sdpa"] @@ -2725,7 +2732,14 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.45.0"): + + if is_transformers_version(">=", "4.47.0"): + from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_FUNCTION + + GEMMA2_ATTENTION_FUNCTION["eager"] = GEMMA2_ATTENTION_FUNCTION["original_eager"] + del 
GEMMA2_ATTENTION_FUNCTION["original_eager"] + + elif is_transformers_version(">=", "4.45.0"): for layer in self._model.model.layers: if hasattr(layer.self_attn, "_orig_forward"): layer.self_attn.forward = layer.self_attn._orig_forward From 97f89797b06f5969fad985b64ee6ab810b4673ad Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 20 Dec 2024 13:51:39 +0100 Subject: [PATCH 4/8] style --- optimum/exporters/openvino/model_patcher.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index fb5bb7ac9..a43b48ae3 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -421,9 +421,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" @@ -2058,9 +2058,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" From 35c47a2252e63dcb284a666c0bf9ef497d05c8c5 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 6 Jan 2025 09:32:57 +0100 Subject: [PATCH 5/8] force attn model --- optimum/exporters/openvino/__main__.py | 2 +- optimum/exporters/openvino/model_patcher.py | 46 +++------------------ 2 files changed, 7 insertions(+), 41 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 592cd85a4..859360e8b 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -49,7 +49,7 @@ ) -FORCE_ATTN_MODEL_CLASSES = {"phi3-v": "eager"} +FORCE_ATTN_MODEL_CLASSES = {"phi3-v": "eager", "gemma2": "sdpa"} if TYPE_CHECKING: from optimum.intel.openvino.configuration import OVConfig diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a43b48ae3..b19525810 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -421,9 +421,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -2058,9 +2058,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + 
offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -2710,40 +2710,6 @@ def patched_forward(*args, **kwargs): self.patched_forward = patched_forward - def __enter__(self): - super().__enter__() - - if is_transformers_version(">=", "4.47.0"): - from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_FUNCTION - - GEMMA2_ATTENTION_FUNCTION["original_eager"] = GEMMA2_ATTENTION_FUNCTION["eager"] - GEMMA2_ATTENTION_FUNCTION["eager"] = GEMMA2_ATTENTION_FUNCTION["sdpa"] - - elif is_transformers_version(">=", "4.45.0"): - from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_CLASSES - - sdpa_attn = GEMMA2_ATTENTION_CLASSES["sdpa"] - eager_attn = GEMMA2_ATTENTION_CLASSES["eager"] - - for layer in self._model.model.layers: - if isinstance(layer.self_attn, eager_attn): - layer.self_attn._orig_forward = layer.self_attn.forward - layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn) - - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - - if is_transformers_version(">=", "4.47.0"): - from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_FUNCTION - - GEMMA2_ATTENTION_FUNCTION["eager"] = GEMMA2_ATTENTION_FUNCTION["original_eager"] - del GEMMA2_ATTENTION_FUNCTION["original_eager"] - - elif is_transformers_version(">=", "4.45.0"): - for layer in self._model.model.layers: - if hasattr(layer.self_attn, "_orig_forward"): - layer.self_attn.forward = layer.self_attn._orig_forward - def _decilm_attn_forward( self, From c6c4a2558a3fa957b299e1b30414001b60bbbeb8 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 6 Jan 2025 10:45:32 +0100 Subject: [PATCH 6/8] latest qwen2 vl position_ids formula --- optimum/exporters/openvino/model_patcher.py | 12 ++++---- .../openvino/modeling_visual_language.py | 28 ++++++++++++++++--- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b19525810..7a6b2998c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -421,9 +421,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" @@ -2058,9 +2058,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index fe85f9212..d0b281e19 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -53,7 +53,7 @@ if TYPE_CHECKING: - from PIL import Image + from PIL.Image import Image logger = logging.getLogger(__name__) @@ -166,9 +166,6 
@@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] - if self.config.model_type == "qwen2_vl" and position_ids.ndim != 3: - position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) - inputs["position_ids"] = position_ids if "beam_idx" in self.input_names: @@ -2100,6 +2097,8 @@ def __init__( quantization_config=quantization_config, **kwargs, ) + self.rope_deltas = None # cache rope_deltas here + if is_transformers_version(">=", "4.45.0"): from transformers.models.qwen2_vl.modeling_qwen2_vl import ( Qwen2VLForConditionalGeneration, @@ -2197,6 +2196,7 @@ def get_multimodal_embeddings( pixel_values_videos=None, image_grid_thw=None, video_grid_thw=None, + cache_position=None, **kwargs, ): inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) @@ -2209,6 +2209,26 @@ def get_multimodal_embeddings( video_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values_videos, video_grid_thw)) video_mask = input_ids == self.config.video_token_id inputs_embeds[video_mask] = video_embeds + + # if we get 4D attention mask we cannot calculate rope deltas anymore. + if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2): + # calculate RoPE index once per generation in the pre-fill stage only + if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None: + position_ids, rope_deltas = self.get_rope_index( + input_ids, image_grid_thw, video_grid_thw, attention_mask + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0 + position_ids = torch.arange(seq_length, device=inputs_embeds.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + if cache_position is not None: # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + return inputs_embeds, attention_mask, position_ids def forward( From 7c6d67cc611f6aece3615cefcddf96603a37dea1 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 6 Jan 2025 10:45:32 +0100 Subject: [PATCH 7/8] latest qwen2 vl position_ids formula --- optimum/exporters/openvino/model_patcher.py | 12 ++++---- .../openvino/modeling_visual_language.py | 28 ++++++++++++++++--- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b19525810..7a6b2998c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -421,9 +421,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" @@ -2058,9 +2058,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : 
mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index fe85f9212..d0b281e19 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -53,7 +53,7 @@ if TYPE_CHECKING: - from PIL import Image + from PIL.Image import Image logger = logging.getLogger(__name__) @@ -166,9 +166,6 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] - if self.config.model_type == "qwen2_vl" and position_ids.ndim != 3: - position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) - inputs["position_ids"] = position_ids if "beam_idx" in self.input_names: @@ -2100,6 +2097,8 @@ def __init__( quantization_config=quantization_config, **kwargs, ) + self.rope_deltas = None # cache rope_deltas here + if is_transformers_version(">=", "4.45.0"): from transformers.models.qwen2_vl.modeling_qwen2_vl import ( Qwen2VLForConditionalGeneration, @@ -2197,6 +2196,7 @@ def get_multimodal_embeddings( pixel_values_videos=None, image_grid_thw=None, video_grid_thw=None, + cache_position=None, **kwargs, ): inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) @@ -2209,6 +2209,26 @@ def get_multimodal_embeddings( video_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values_videos, video_grid_thw)) video_mask = input_ids == self.config.video_token_id inputs_embeds[video_mask] = video_embeds + + # if we get 4D attention mask we cannot calculate rope deltas anymore. + if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2): + # calculate RoPE index once per generation in the pre-fill stage only + if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None: + position_ids, rope_deltas = self.get_rope_index( + input_ids, image_grid_thw, video_grid_thw, attention_mask + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0 + position_ids = torch.arange(seq_length, device=inputs_embeds.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + if cache_position is not None: # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + return inputs_embeds, attention_mask, position_ids def forward( From b6f7e7430e90ef4ae50da79b5fa14f73598105da Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 6 Jan 2025 12:17:47 +0100 Subject: [PATCH 8/8] revert --- optimum/intel/openvino/modeling_visual_language.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index d0b281e19..1c0e35cca 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -166,6 +166,9 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] + if self.config.model_type == "qwen2_vl" and position_ids.ndim 
!= 3:
+            position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0)
+
         inputs["position_ids"] = position_ids
 
         if "beam_idx" in self.input_names:
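Reviewer note (not part of the patch series above): patch 3 works around the transformers 4.47 Gemma2 refactor by stashing GEMMA2_ATTENTION_FUNCTION["eager"] under an "original_eager" key, pointing "eager" at the "sdpa" entry for the duration of the export, and restoring it on exit; patch 5 then drops that in favour of forcing _attn_implementation="sdpa" at load time via FORCE_ATTN_MODEL_CLASSES. The sketch below illustrates only the swap-and-restore pattern; the registry dict, the lambda entries and the eager_as_sdpa name are invented stand-ins, not the transformers or optimum-intel API.

from contextlib import contextmanager

# Stand-in for a transformers-style attention-function registry.
ATTENTION_FUNCTIONS = {
    "eager": lambda: "eager attention",
    "sdpa": lambda: "sdpa attention",
}

@contextmanager
def eager_as_sdpa(registry):
    registry["original_eager"] = registry["eager"]   # stash the original entry
    registry["eager"] = registry["sdpa"]             # route "eager" lookups to the SDPA path
    try:
        yield registry
    finally:
        registry["eager"] = registry.pop("original_eager")  # restore and drop the stash

with eager_as_sdpa(ATTENTION_FUNCTIONS) as registry:
    print(registry["eager"]())          # prints "sdpa attention" while patched
print(ATTENTION_FUNCTIONS["eager"]())   # prints "eager attention" once restored

Forcing the implementation when the model is instantiated (patch 5) avoids mutating a shared module-level registry and keeps the patcher stateless, which is presumably why the __enter__/__exit__ hooks were removed again.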
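Reviewer note (not part of the patch series above): patches 6-8 move the Qwen2-VL position-id handling into get_multimodal_embeddings, computing get_rope_index once at pre-fill, caching the returned rope_deltas, and rebuilding the 3D position_ids from that cache on every decode step (patch 8 keeps the prepare_inputs fallback that expands 2D position_ids to three planes). A minimal, self-contained sketch of the decode-step branch follows; decode_step_position_ids and the sample tensors are invented for illustration, and the (batch, 1) shape of rope_deltas is an assumption about what get_rope_index returns.

import torch

def decode_step_position_ids(cache_position, rope_deltas, batch_size, seq_length):
    # Shift a plain arange of new token positions by the offset cached at pre-fill time.
    delta = cache_position[0] + rope_deltas                         # (batch, 1) by broadcasting
    position_ids = torch.arange(seq_length)
    position_ids = position_ids.view(1, -1).expand(batch_size, -1)  # (batch, seq_length)
    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
    position_ids = position_ids.add(delta)
    # Qwen2-VL's multimodal RoPE expects one plane per axis: temporal, height, width.
    return position_ids.unsqueeze(0).expand(3, -1, -1)              # (3, batch, seq_length)

rope_deltas = torch.tensor([[-3], [5]])   # hypothetical per-sequence deltas cached at pre-fill
cache_position = torch.tensor([17])       # position of the token being decoded
print(decode_step_position_ids(cache_position, rope_deltas, batch_size=2, seq_length=1).shape)
# torch.Size([3, 2, 1])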