From f3e3433f9763eb0d95c9c96ecddec7f72fe5ff1b Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Mon, 2 Mar 2026 19:19:37 +0900 Subject: [PATCH 01/14] Add support for Mistral3 --- docs/source/openvino/models.mdx | 1 + optimum/exporters/openvino/model_configs.py | 138 ++++++++++++++++++ optimum/exporters/openvino/model_patcher.py | 60 ++++++++ optimum/exporters/openvino/utils.py | 1 + .../openvino/modeling_visual_language.py | 89 +++++++++++ 5 files changed, 289 insertions(+) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 51200060e8..ec8c53826d 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -105,6 +105,7 @@ Here is the list of the supported architectures : - MiniCPM-o - MiniCPMV - Mistral +- Mistral3 - Mixtral - MobileBert - MobileNet v1 diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 7ffe158396..9376a99ae4 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -175,6 +175,8 @@ MiniCPMModelPatcher, MiniCPMVImageEmbeddingsModelPatcher, MiniCPMVResamplerModelPatcher, + Mistral3ImageEmbeddingModelPatcher, + Mistral3MultiModalProjectorPatcher, MistralModelPatcher, MixtralModelPatcher, MPTModelPatcher, @@ -257,6 +259,10 @@ def init_model_configs(): "transformers", "AutoModelForVision2Seq", ) + TasksManager._CUSTOM_CLASSES[("pt", "mistral3", "image-text-to-text")] = ( + "transformers", + "AutoModelForImageTextToText", + ) TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( "transformers", "Gemma3ForConditionalGeneration", @@ -2053,6 +2059,138 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ return LlavaNextVideoImageEmbeddingModelPatcher(self, model, model_kwargs) +class Mistral3ConfigBehavior(str, enum.Enum): + LANGUAGE = "language" + VISION_EMBEDDINGS = "vision_embeddings" + TEXT_EMBEDDINGS = "text_embeddings" + MULTI_MODAL_PROJECTOR = "multi_modal_projector" + + +class DummyMistral3MultiModalProjectorInputGenerator(DummyInputGenerator): + SUPPORTED_INPUT_NAMES = ["image_features"] + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + self.task = task + self.batch_size = batch_size + self.hidden_size = normalized_config.hidden_size + self.spatial_merge_size = getattr( + normalized_config.config, "spatial_merge_size", + getattr(normalized_config, "spatial_merge_size", 2) + ) + image_size = normalized_config.image_size + patch_size = normalized_config.patch_size + patches_per_side = image_size // patch_size + merged_per_side = patches_per_side // self.spatial_merge_size + self.num_merged_patches = merged_per_side * merged_per_side + + def generate( + self, + input_name: str, + framework: str = "pt", + int_dtype: str = "int64", + float_dtype: str = "fp32", + ): + input_dim = self.hidden_size * self.spatial_merge_size ** 2 + shape = [self.num_merged_patches, input_dim] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + + +class Mistral3MultiModalProjectorOpenVINOConfig(OnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (DummyMistral3MultiModalProjectorInputGenerator,) + NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig + _MODEL_PATCHER = Mistral3MultiModalProjectorPatcher + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return {"image_features": {0: "num_patches"}} + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return {"hidden_states": {0: "num_patches"}} + + +@register_in_tasks_manager("mistral3", *["image-text-to-text"], library_name="transformers") +class Mistral3OpenVINOConfig(BaseVLMOpenVINOConfig): + MIN_TRANSFORMERS_VERSION = "4.50.0" + SUPPORTED_BEHAVIORS = [model_type.value for model_type in Mistral3ConfigBehavior] + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: VLMConfigBehavior = VLMConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + **kwargs, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + ) + self._orig_config = config + if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + + def with_behavior( + self, + behavior: Union[str, Mistral3ConfigBehavior], + ): + if isinstance(behavior, str) and not isinstance(behavior, Mistral3ConfigBehavior): + behavior = Mistral3ConfigBehavior(behavior) + + if behavior == Mistral3ConfigBehavior.MULTI_MODAL_PROJECTOR: + return Mistral3MultiModalProjectorOpenVINOConfig( + self._orig_config.vision_config, + task="feature-extraction", + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + ) + + return super().with_behavior(behavior) + + def get_model_for_behavior( + self, model, behavior: Union[str, Mistral3ConfigBehavior] + ): + if isinstance(behavior, str) and not isinstance(behavior, Mistral3ConfigBehavior): + behavior = Mistral3ConfigBehavior(behavior) + + if behavior == Mistral3ConfigBehavior.MULTI_MODAL_PROJECTOR: + return ( + model.multi_modal_projector + if hasattr(model, "multi_modal_projector") + else model.model.multi_modal_projector + ) + + return super().get_model_for_behavior(model, behavior) + + def patch_model_for_export( + self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None + ): + model_kwargs = model_kwargs or {} + + if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: + return super().patch_model_for_export(model, model_kwargs) + + return Mistral3ImageEmbeddingModelPatcher(self, model, model_kwargs) + + def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict: + if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS and self._config.model_type == "pixtral": + kwargs["batch_size"] = 1 + return super().generate_dummy_inputs(framework, **kwargs) + + @register_in_tasks_manager( "maira2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers" ) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 557cd1f8d1..6e232f93f9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3307,6 +3307,66 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward +# Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L223-L248 +# Mistral3Model.get_image_features() with only projector.norm() applied instead of full projector forward, +# as the patch_merger cycle block (unfold loop) cannot be traced to OpenVINO IR. +def mistral3_vision_embed_forward(self, pixel_values): + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + vision_feature_layer = self.config.vision_feature_layer + if isinstance(vision_feature_layer, int): + selected_image_feature = image_features.hidden_states[vision_feature_layer] + else: + hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + selected_image_feature = torch.cat(hs_pool, dim=-1) + image_features = self.multi_modal_projector.norm(selected_image_feature.squeeze(0)) + return image_features + + +# Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L118-L124 +# Mistral3MultiModalProjector.forward() and Mistral3PatchMerger.forward() with norm and cycle block excluded. +# norm is moved to vision_embed_forward, cycle block runs in PyTorch at runtime. +def mistral3_multi_modal_projector_forward(self, image_features): + hidden_states = self.patch_merger.merging_layer(image_features) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class Mistral3ImageEmbeddingModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + model.forward = types.MethodType(mistral3_vision_embed_forward, model) + + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + +class Mistral3MultiModalProjectorPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + model.forward = types.MethodType(mistral3_multi_modal_projector_forward, model) + + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + def _embednb_forward(self, ids: torch.Tensor) -> torch.Tensor: def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: assert dim % 2 == 0, "The dimension must be even." diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 3d9a854e39..edbaa65ce2 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -288,6 +288,7 @@ def get_submodels(model): "llava_next", "llava_next_video", "llava-qwen2", + "mistral3", "internvl_chat", "maira2", "minicpmv", diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 2fe8cb0ea0..e26f47c4a3 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -1699,6 +1699,94 @@ def get_video_features(self, pixel_values, input_ids=None, **kwargs): return video_features +class _OVMistral3ForCausalLM(OVModelForVisualCausalLM): + additional_parts = ["multi_modal_projector"] + + def get_vision_embeddings(self, pixel_values, input_ids=None, image_sizes=None, **kwargs): + if input_ids is not None and input_ids.shape[1] == 1: + return None + + image_features = self.vision_embeddings(pixel_values).last_hidden_state + image_features = torch.from_numpy(image_features) if isinstance(image_features, np.ndarray) else image_features + + # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L75-L96 + patch_size = self.config.vision_config.patch_size + spatial_merge_size = self.config.spatial_merge_size + d = image_features.shape[-1] + + image_sizes_scaled = [ + (size[0] // patch_size, size[1] // patch_size) + for size in image_sizes + ] + tokens_per_image = [h * w for h, w in image_sizes_scaled] + + permuted_tensor = [] + for image_index, image_tokens in enumerate(image_features.split(tokens_per_image)): + h, w = image_sizes_scaled[image_index] + image_grid = image_tokens.view(h, w, d).permute(2, 0, 1).unsqueeze(0) + grid = torch.nn.functional.unfold( + image_grid, + kernel_size=spatial_merge_size, + stride=spatial_merge_size, + ) + grid = grid.view(d * spatial_merge_size**2, -1).t() + permuted_tensor.append(grid) + + image_features = torch.cat(permuted_tensor, dim=0) + image_features = self.multi_modal_projector(image_features) + + return image_features + + # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L258-L280 + # and https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L313-L324 + def merge_vision_text_embeddings( + self, + vision_embeds, + inputs_embeds, + input_ids=None, + attention_mask=None, + position_ids=None, + **kwargs, + ): + image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds + inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds + + special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + return inputs_embeds, attention_mask, position_ids + + @staticmethod + def preprocess_inputs( + text: str, + image: Optional["Image"] = None, + processor: Optional[AutoImageProcessor] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + config: Optional[PretrainedConfig] = None, + video: Optional["VideoInput"] = None, + audio: Optional[np.ndarray] = None, + ): + if processor is None: + raise ValueError("Processor is required.") + if video is not None or audio is not None: + raise ValueError("Video/Audio input is not supported for Mistral3") + + conversation = [ + { + "role": "user", + "content": [{"type": "text", "text": text}], + } + ] + if image is not None: + conversation[0]["content"].insert(0, {"type": "image"}) + + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + inputs = processor(images=image, text=prompt, return_tensors="pt") + return inputs + + class _OVInternVLForCausalLM(OVModelForVisualCausalLM): def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs): if input_ids is not None and input_ids.shape[1] == 1: @@ -4806,6 +4894,7 @@ def preprocess_inputs( "llava": _OVLlavaForCausalLM, "llava_next": _OVLlavaNextForCausalLM, "llava_next_video": _OVLlavaNextVideoForCausalLM, + "mistral3": _OVMistral3ForCausalLM, "minicpmv": _OVMiniCPMVForCausalLM, "llava-qwen2": _OVNanoLlavaForCausalLM, "maira2": _OVMaira2ForCausalLM, From 4808de0f5877076cd0d88444d673789a9fb895a3 Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Wed, 4 Mar 2026 01:01:56 +0900 Subject: [PATCH 02/14] Add tests --- tests/openvino/test_export.py | 3 +++ tests/openvino/test_exporters_cli.py | 7 +++++++ tests/openvino/test_genai.py | 1 + tests/openvino/test_quantization.py | 3 +++ tests/openvino/test_seq2seq.py | 3 +++ tests/openvino/utils_tests.py | 7 +++++++ 6 files changed, 24 insertions(+) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index d1c373e2bc..9dde7a29f3 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -101,6 +101,9 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.49"): SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM}) + if is_transformers_version(">=", "4.50.0"): + SUPPORTED_ARCHITECTURES.update({"mistral3": OVModelForVisualCausalLM}) + if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES.update({"granitemoehybrid": OVModelForCausalLM}) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index edbc01e310..005151712d 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -123,6 +123,13 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) + if is_transformers_version(">=", "4.50.0"): + SUPPORTED_ARCHITECTURES.extend( + [ + ("image-text-to-text", "mistral3"), + ] + ) + if is_transformers_version(">=", "4.54.0"): SUPPORTED_ARCHITECTURES.extend( [ diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 2d075e7874..4e2939190d 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -243,6 +243,7 @@ def _get_model_class(self, model_arch): "llava", "llava_next", "llava_next_mistral", + "mistral3", "qwen2_vl", "qwen2_5_vl", "gemma3", diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index bfc6ec976a..8c4f8a9370 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1077,6 +1077,9 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "cohere2", False)) + if is_transformers_version(">=", "4.50.0"): + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "mistral3", False)) + if is_transformers_version(">=", "4.54.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "exaone4", True)) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index ac91b2f4ad..428bf2101f 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -545,6 +545,8 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORT_AUDIO.append("phi4mm") if is_transformers_version(">", "4.49"): SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] + if is_transformers_version(">=", "4.50.0"): + SUPPORTED_ARCHITECTURES += ["mistral3"] if is_transformers_version(">=", "4.51"): # SUPPORTED_ARCHITECTURES += ["llama4", "phi4_multimodal"] SUPPORTED_ARCHITECTURES += ["llama4"] @@ -572,6 +574,7 @@ def get_transformer_model_class(self, model_arch): "llava", "llava_next", "llava_next_mistral", + "mistral3", "qwen2_vl", "qwen2_5_vl", "got_ocr2", diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 230ec88e45..510923e31c 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -139,6 +139,7 @@ "minicpmo": "optimum-intel-internal-testing/tiny-random-MiniCPM-o-2_6", "mistral": "optimum-intel-internal-testing/tiny-random-mistral", "mistral-nemo": "optimum-intel-internal-testing/tiny-random-mistral-nemo", + "mistral3": "./tiny-random-mistral3", "mixtral": "optimum-intel-internal-testing/tiny-mixtral", "mixtral_awq": "optimum-intel-internal-testing/tiny-mixtral-AWQ-4bit", "mobilebert": "optimum-intel-internal-testing/tiny-random-MobileBertModel", @@ -321,6 +322,12 @@ "text_embeddings_model": 1, "vision_embeddings_model": 15, }, + "mistral3": { + "lm_model": 30, + "text_embeddings_model": 1, + "vision_embeddings_model": 16, + "mulit_modal_projector_model": 3, + }, "qwen2_vl": { "lm_model": 30, "text_embeddings_model": 1, From 3100050ad885a72ac36119355ecb1b419d6b366c Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Wed, 4 Mar 2026 16:05:29 +0900 Subject: [PATCH 03/14] Fix typo --- tests/openvino/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 510923e31c..9f7bf2e861 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -326,7 +326,7 @@ "lm_model": 30, "text_embeddings_model": 1, "vision_embeddings_model": 16, - "mulit_modal_projector_model": 3, + "multi_modal_projector_model": 3, }, "qwen2_vl": { "lm_model": 30, From 976843b4c573163a38fe1a39405feabc3ff0fb87 Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Wed, 4 Mar 2026 18:36:05 +0900 Subject: [PATCH 04/14] Update test model path --- tests/openvino/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 9f7bf2e861..73bdd18189 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -139,7 +139,7 @@ "minicpmo": "optimum-intel-internal-testing/tiny-random-MiniCPM-o-2_6", "mistral": "optimum-intel-internal-testing/tiny-random-mistral", "mistral-nemo": "optimum-intel-internal-testing/tiny-random-mistral-nemo", - "mistral3": "./tiny-random-mistral3", + "mistral3": "optimum-intel-internal-testing/tiny-random-mistral3", "mixtral": "optimum-intel-internal-testing/tiny-mixtral", "mixtral_awq": "optimum-intel-internal-testing/tiny-mixtral-AWQ-4bit", "mobilebert": "optimum-intel-internal-testing/tiny-random-MobileBertModel", From 52c4b61bda80c6156a1761ad589debc013a83dd6 Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Sat, 7 Mar 2026 12:46:46 +0900 Subject: [PATCH 05/14] Remove redundant custom mapping --- optimum/exporters/openvino/model_configs.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 9376a99ae4..3326f4b4c0 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -259,10 +259,6 @@ def init_model_configs(): "transformers", "AutoModelForVision2Seq", ) - TasksManager._CUSTOM_CLASSES[("pt", "mistral3", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( "transformers", "Gemma3ForConditionalGeneration", From 422709ac6fc3df2bab5e5800e6245cfda43c4006 Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Sat, 7 Mar 2026 12:53:06 +0900 Subject: [PATCH 06/14] Fix model naming in docs --- docs/source/openvino/models.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index ec8c53826d..99b2c9008d 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -105,7 +105,7 @@ Here is the list of the supported architectures : - MiniCPM-o - MiniCPMV - Mistral -- Mistral3 +- Mistral 3 - Mixtral - MobileBert - MobileNet v1 From a6bb58ed5bfe7d165d6753320b6db98123530197 Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Sat, 7 Mar 2026 14:04:15 +0900 Subject: [PATCH 07/14] Refactor projector input generator to inherit from LLava --- optimum/exporters/openvino/model_configs.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 3326f4b4c0..793e9a1928 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2062,9 +2062,7 @@ class Mistral3ConfigBehavior(str, enum.Enum): MULTI_MODAL_PROJECTOR = "multi_modal_projector" -class DummyMistral3MultiModalProjectorInputGenerator(DummyInputGenerator): - SUPPORTED_INPUT_NAMES = ["image_features"] - +class DummyMistral3MultiModalProjectorInputGenerator(DummyLLavaMultiModalProjectorInputGenerator): def __init__( self, task: str, @@ -2073,18 +2071,12 @@ def __init__( random_batch_size_range: Optional[Tuple[int, int]] = None, **kwargs, ): - self.task = task - self.batch_size = batch_size - self.hidden_size = normalized_config.hidden_size + super().__init__(task, normalized_config, batch_size, random_batch_size_range, **kwargs) self.spatial_merge_size = getattr( normalized_config.config, "spatial_merge_size", getattr(normalized_config, "spatial_merge_size", 2) ) - image_size = normalized_config.image_size - patch_size = normalized_config.patch_size - patches_per_side = image_size // patch_size - merged_per_side = patches_per_side // self.spatial_merge_size - self.num_merged_patches = merged_per_side * merged_per_side + self.num_merged_patches = self.num_patches // (self.spatial_merge_size ** 2) def generate( self, From 56eb853609c442f773946c19cb7f62dc1b962e4e Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Sat, 7 Mar 2026 14:26:13 +0900 Subject: [PATCH 08/14] Update comments and links --- optimum/exporters/openvino/model_configs.py | 5 +++++ optimum/exporters/openvino/model_patcher.py | 5 +++-- optimum/intel/openvino/modeling_visual_language.py | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 793e9a1928..15f20db2dd 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2057,6 +2057,11 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ class Mistral3ConfigBehavior(str, enum.Enum): LANGUAGE = "language" + # VISION_EMBEDDINGS extracts visual features and applies projector.norm(). + # Combined with the cycle block + # (https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L76-L94) + # and MULTI_MODAL_PROJECTOR, this is equivalent to get_image_features + # (https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L223-L248). VISION_EMBEDDINGS = "vision_embeddings" TEXT_EMBEDDINGS = "text_embeddings" MULTI_MODAL_PROJECTOR = "multi_modal_projector" diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 6e232f93f9..b43e0e402c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3307,7 +3307,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -# Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L223-L248 +# Adopted from https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L223-L248 # Mistral3Model.get_image_features() with only projector.norm() applied instead of full projector forward, # as the patch_merger cycle block (unfold loop) cannot be traced to OpenVINO IR. def mistral3_vision_embed_forward(self, pixel_values): @@ -3322,7 +3322,8 @@ def mistral3_vision_embed_forward(self, pixel_values): return image_features -# Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L118-L124 +# Adopted from https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L76-L94 +# and https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L118-L124 # Mistral3MultiModalProjector.forward() and Mistral3PatchMerger.forward() with norm and cycle block excluded. # norm is moved to vision_embed_forward, cycle block runs in PyTorch at runtime. def mistral3_multi_modal_projector_forward(self, image_features): diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index e26f47c4a3..30f9259717 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -1709,7 +1709,7 @@ def get_vision_embeddings(self, pixel_values, input_ids=None, image_sizes=None, image_features = self.vision_embeddings(pixel_values).last_hidden_state image_features = torch.from_numpy(image_features) if isinstance(image_features, np.ndarray) else image_features - # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L75-L96 + # Adopted from https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L75-L96 patch_size = self.config.vision_config.patch_size spatial_merge_size = self.config.spatial_merge_size d = image_features.shape[-1] @@ -1737,8 +1737,8 @@ def get_vision_embeddings(self, pixel_values, input_ids=None, image_sizes=None, return image_features - # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L258-L280 - # and https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral3/modeling_mistral3.py#L313-L324 + # Adopted from https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L258-L280 + # and https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L313-L324 def merge_vision_text_embeddings( self, vision_embeds, From 2f288b027b3f7c5478cdcbda4cbe278a4260eb93 Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Fri, 13 Mar 2026 16:57:29 +0900 Subject: [PATCH 09/14] Fix test --- tests/openvino/test_exporters_cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 005151712d..22d80f6c66 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -192,6 +192,7 @@ class OVCLIExportTestCase(unittest.TestCase): if is_openvino_version(">=", "2026.0") else 0, # Tokenizers fail to convert on 2025.4, ticket: CVS-176880 "llava": 2, + "mistral3": 2, "sana": 2, "ltx-video": 2, "sam": 0, # no tokenizer From a46ff83e6c02d5a37944cfb0b8a7428cf429210b Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Fri, 13 Mar 2026 17:07:10 +0900 Subject: [PATCH 10/14] Fix code style --- optimum/exporters/openvino/model_configs.py | 18 +++----- optimum/exporters/openvino/model_patcher.py | 9 ++-- optimum/exporters/openvino/utils.py | 1 - .../openvino/modeling_visual_language.py | 8 +--- tests/openvino/test_export.py | 1 - tests/openvino/test_exporters_cli.py | 42 +++++++++++-------- tests/openvino/test_genai.py | 1 - tests/openvino/test_quantization.py | 22 ++++++---- tests/openvino/test_seq2seq.py | 1 - tests/openvino/utils_tests.py | 1 - 10 files changed, 49 insertions(+), 55 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 15f20db2dd..874221a2d5 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -205,7 +205,6 @@ Zamba2ModelPatcher, ) - COMMON_TEXT_TASKS = [ "feature-extraction", "fill-mask", @@ -2078,10 +2077,9 @@ def __init__( ): super().__init__(task, normalized_config, batch_size, random_batch_size_range, **kwargs) self.spatial_merge_size = getattr( - normalized_config.config, "spatial_merge_size", - getattr(normalized_config, "spatial_merge_size", 2) + normalized_config.config, "spatial_merge_size", getattr(normalized_config, "spatial_merge_size", 2) ) - self.num_merged_patches = self.num_patches // (self.spatial_merge_size ** 2) + self.num_merged_patches = self.num_patches // (self.spatial_merge_size**2) def generate( self, @@ -2090,7 +2088,7 @@ def generate( int_dtype: str = "int64", float_dtype: str = "fp32", ): - input_dim = self.hidden_size * self.spatial_merge_size ** 2 + input_dim = self.hidden_size * self.spatial_merge_size**2 shape = [self.num_merged_patches, input_dim] return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) @@ -2107,7 +2105,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: return {"hidden_states": {0: "num_patches"}} - + @register_in_tasks_manager("mistral3", *["image-text-to-text"], library_name="transformers") class Mistral3OpenVINOConfig(BaseVLMOpenVINOConfig): @@ -2153,9 +2151,7 @@ def with_behavior( return super().with_behavior(behavior) - def get_model_for_behavior( - self, model, behavior: Union[str, Mistral3ConfigBehavior] - ): + def get_model_for_behavior(self, model, behavior: Union[str, Mistral3ConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, Mistral3ConfigBehavior): behavior = Mistral3ConfigBehavior(behavior) @@ -2168,9 +2164,7 @@ def get_model_for_behavior( return super().get_model_for_behavior(model, behavior) - def patch_model_for_export( - self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None - ): + def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): model_kwargs = model_kwargs or {} if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b43e0e402c..0924ce27e6 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -56,7 +56,6 @@ ) from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version - if is_transformers_version(">=", "4.53"): from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock @@ -2246,7 +2245,7 @@ def _persimmon_self_attn_sdpa_forward( fused_qkv = self.query_key_value(hidden_states) # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_states, key_states, value_states) = self._split_heads(fused_qkv) + query_states, key_states, value_states = self._split_heads(fused_qkv) if self.qk_layernorm: query_states = self.q_layernorm(query_states) @@ -3322,7 +3321,7 @@ def mistral3_vision_embed_forward(self, pixel_values): return image_features -# Adopted from https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L76-L94 +# Adopted from https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L76-L94 # and https://github.com/huggingface/transformers/blob/v5.2.0/src/transformers/models/mistral3/modeling_mistral3.py#L118-L124 # Mistral3MultiModalProjector.forward() and Mistral3PatchMerger.forward() with norm and cycle block excluded. # norm is moved to vision_embed_forward, cycle block runs in PyTorch at runtime. @@ -6337,8 +6336,8 @@ def __enter__(self): if is_transformers_version(">=", "4.56"): # openvino is not able to trace through the new chunked_overlay with left_padding self.original_chunked_overlay = transformers.masking_utils.chunked_overlay - transformers.masking_utils.chunked_overlay = ( - lambda chunk_size, left_padding: transformers.masking_utils._legacy_chunked_overlay(chunk_size) + transformers.masking_utils.chunked_overlay = lambda chunk_size, left_padding: ( + transformers.masking_utils._legacy_chunked_overlay(chunk_size) ) def __exit__(self, exc_type, exc_value, traceback): diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index edbaa65ce2..0588359026 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -30,7 +30,6 @@ from optimum.utils import is_diffusers_available from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors - logger = logging.getLogger(__name__) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 30f9259717..987f132eb4 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -44,7 +44,6 @@ classproperty, ) - if is_transformers_version(">=", "4.46.0"): from transformers import AutoModelForImageTextToText @@ -1714,10 +1713,7 @@ def get_vision_embeddings(self, pixel_values, input_ids=None, image_sizes=None, spatial_merge_size = self.config.spatial_merge_size d = image_features.shape[-1] - image_sizes_scaled = [ - (size[0] // patch_size, size[1] // patch_size) - for size in image_sizes - ] + image_sizes_scaled = [(size[0] // patch_size, size[1] // patch_size) for size in image_sizes] tokens_per_image = [h * w for h, w in image_sizes_scaled] permuted_tensor = [] @@ -1785,7 +1781,7 @@ def preprocess_inputs( prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) inputs = processor(images=image, text=prompt, return_tensors="pt") return inputs - + class _OVInternVLForCausalLM(OVModelForVisualCausalLM): def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs): diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 9dde7a29f3..1d765561ee 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -62,7 +62,6 @@ from optimum.utils import logging from optimum.utils.save_utils import maybe_load_preprocessors - logger = logging.get_logger() diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 22d80f6c66..9227839458 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -188,9 +188,9 @@ class OVCLIExportTestCase(unittest.TestCase): "stable-diffusion-3": 6, "flux": 4, "flux-fill": 4, - "lfm2": 2 - if is_openvino_version(">=", "2026.0") - else 0, # Tokenizers fail to convert on 2025.4, ticket: CVS-176880 + "lfm2": ( + 2 if is_openvino_version(">=", "2026.0") else 0 + ), # Tokenizers fail to convert on 2025.4, ticket: CVS-176880 "llava": 2, "mistral3": 2, "sana": 2, @@ -293,9 +293,11 @@ class OVCLIExportTestCase(unittest.TestCase): "whisper", "int8", "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", - {"encoder": 14, "decoder": 22, "decoder_with_past": 22} - if is_transformers_version("<=", "4.45") - else {"encoder": 14, "decoder": 22, "decoder_with_past": 25}, + ( + {"encoder": 14, "decoder": 22, "decoder_with_past": 22} + if is_transformers_version("<=", "4.45") + else {"encoder": 14, "decoder": 22, "decoder_with_past": 25} + ), ( {"encoder": {"int8": 14}, "decoder": {"int8": 22}, "decoder_with_past": {"int8": 17}} if is_transformers_version("<=", "4.45") @@ -307,9 +309,11 @@ class OVCLIExportTestCase(unittest.TestCase): "whisper", "f8e4m3", "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", - {"encoder": 16, "decoder": 26, "decoder_with_past": 23} - if is_transformers_version("<=", "4.45") - else {"encoder": 16, "decoder": 26, "decoder_with_past": 25}, + ( + {"encoder": 16, "decoder": 26, "decoder_with_past": 23} + if is_transformers_version("<=", "4.45") + else {"encoder": 16, "decoder": 26, "decoder_with_past": 25} + ), ( {"encoder": {"f8e4m3": 14}, "decoder": {"f8e4m3": 22}, "decoder_with_past": {"f8e4m3": 17}} if is_transformers_version("<=", "4.45") @@ -485,12 +489,14 @@ class OVCLIExportTestCase(unittest.TestCase): "t5", "int8", "--dataset c4:seq_len=64 --num-samples 1", - {"encoder": 30, "decoder": 52, "decoder_with_past": 61} - if is_transformers_version("<=", "4.45") - else { - "encoder": 30, - "decoder": 52, - }, + ( + {"encoder": 30, "decoder": 52, "decoder_with_past": 61} + if is_transformers_version("<=", "4.45") + else { + "encoder": 30, + "decoder": 52, + } + ), ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") @@ -762,9 +768,9 @@ class OVCLIExportTestCase(unittest.TestCase): 'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" ' "--dataset contextual --num-samples 1 --trust-remote-code", { - "lm_model": {"int8": 10, "int4": 20} - if is_transformers_version(">=", "4.54") - else {"int8": 6, "int4": 24}, + "lm_model": ( + {"int8": 10, "int4": 20} if is_transformers_version(">=", "4.54") else {"int8": 6, "int4": 24} + ), "text_embeddings_model": {"int8": 1}, "vision_embeddings_model": {"int8": 1}, "vision_embeddings_merger_model": {"int8": 12}, diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 4e2939190d..ad8a5a3b12 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -36,7 +36,6 @@ from optimum.intel.utils.import_utils import is_openvino_version from optimum.utils import is_transformers_version - os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 8c4f8a9370..fb7a4dbdc6 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -340,12 +340,14 @@ class OVQuantizerTest(unittest.TestCase): dataset="wikitext2:seq_len=64", num_samples=1, ), - {"encoder": 30, "decoder": 52, "decoder_with_past": 61} - if is_transformers_version("<=", "4.45") - else { - "encoder": 30, - "decoder": 52, - }, + ( + {"encoder": 30, "decoder": 52, "decoder_with_past": 61} + if is_transformers_version("<=", "4.45") + else { + "encoder": 30, + "decoder": 52, + } + ), ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") @@ -639,9 +641,11 @@ class OVWeightCompressionTest(unittest.TestCase): group_size=32, ignored_scope={ "names": [ - "__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" - if is_transformers_version("<", "4.57") - else "__module.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" + ( + "__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" + if is_transformers_version("<", "4.57") + else "__module.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" + ) ] }, ), diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 428bf2101f..6089042018 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -69,7 +69,6 @@ from optimum.intel.pipelines import pipeline as optimum_pipeline from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version - os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 73bdd18189..ca5058e2b4 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -24,7 +24,6 @@ from optimum.exporters.tasks import TasksManager from optimum.intel.utils.import_utils import is_transformers_version - SEED = 42 F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"} From 1d4c1e905cec79e50461dddbc36b598ebe0cf94f Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Sat, 14 Mar 2026 12:05:36 +0900 Subject: [PATCH 11/14] Fix test --- tests/openvino/test_seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index ab907cd486..1ce5902caf 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -731,7 +731,7 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): transformers_model.config.eos_token_id = None ov_model.generation_config.do_sample = False # minicpmo diverges after 20 tokens - tokens_to_generate = 20 if model_arch == "minicpmo" else 30 + tokens_to_generate = 20 if model_arch in ["minicpmo", "mistral3"] else 30 gen_config = GenerationConfig( max_new_tokens=tokens_to_generate, min_new_tokens=tokens_to_generate, From 316ffa463c9e058901d7f2561df29d7f7a864a0e Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Sat, 14 Mar 2026 12:08:46 +0900 Subject: [PATCH 12/14] Fix code style --- optimum/exporters/openvino/model_configs.py | 1 + optimum/exporters/openvino/utils.py | 1 + optimum/intel/openvino/modeling_visual_language.py | 1 + tests/openvino/test_export.py | 1 + tests/openvino/test_genai.py | 1 + tests/openvino/test_seq2seq.py | 1 + 6 files changed, 6 insertions(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 97529120ed..fef7dc361a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -206,6 +206,7 @@ Zamba2ModelPatcher, ) + COMMON_TEXT_TASKS = [ "feature-extraction", "fill-mask", diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 86c5197bda..59d3215771 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -30,6 +30,7 @@ from optimum.utils import is_diffusers_available from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors + logger = logging.getLogger(__name__) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 31d3e1a1ae..c69c701b27 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -44,6 +44,7 @@ classproperty, ) + if is_transformers_version(">=", "4.46.0"): from transformers import AutoModelForImageTextToText diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 05d1c18d13..2caecd29fa 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -62,6 +62,7 @@ from optimum.utils import logging from optimum.utils.save_utils import maybe_load_preprocessors + logger = logging.get_logger() diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index ad8a5a3b12..4e2939190d 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -36,6 +36,7 @@ from optimum.intel.utils.import_utils import is_openvino_version from optimum.utils import is_transformers_version + os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 1ce5902caf..bcf51b6945 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -69,6 +69,7 @@ from optimum.intel.pipelines import pipeline as optimum_pipeline from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version + os.environ["TOKENIZERS_PARALLELISM"] = "false" From aa397b2445b0ac5be758e59acdbfd9bccd94547d Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Mon, 16 Mar 2026 19:02:45 +0900 Subject: [PATCH 13/14] Fix code style --- tests/openvino/utils_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 7760da7678..d2120aa50f 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -24,6 +24,7 @@ from optimum.exporters.tasks import TasksManager from optimum.intel.utils.import_utils import is_transformers_version + SEED = 42 F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"} From 46cf664e8263a1ca5120cdbabf23538863bff932 Mon Sep 17 00:00:00 2001 From: kyoui-dev Date: Wed, 18 Mar 2026 21:49:17 +0900 Subject: [PATCH 14/14] Revert unintended code style change --- optimum/exporters/openvino/model_patcher.py | 6 +-- tests/openvino/test_exporters_cli.py | 42 +++++++++------------ tests/openvino/test_quantization.py | 22 +++++------ 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index dd898df905..2be5ecaa84 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2248,7 +2248,7 @@ def _persimmon_self_attn_sdpa_forward( fused_qkv = self.query_key_value(hidden_states) # 3 x [batch_size, seq_length, num_heads, head_dim] - query_states, key_states, value_states = self._split_heads(fused_qkv) + (query_states, key_states, value_states) = self._split_heads(fused_qkv) if self.qk_layernorm: query_states = self.q_layernorm(query_states) @@ -6339,8 +6339,8 @@ def __enter__(self): if is_transformers_version(">=", "4.56"): # openvino is not able to trace through the new chunked_overlay with left_padding self.original_chunked_overlay = transformers.masking_utils.chunked_overlay - transformers.masking_utils.chunked_overlay = lambda chunk_size, left_padding: ( - transformers.masking_utils._legacy_chunked_overlay(chunk_size) + transformers.masking_utils.chunked_overlay = ( + lambda chunk_size, left_padding: transformers.masking_utils._legacy_chunked_overlay(chunk_size) ) def __exit__(self, exc_type, exc_value, traceback): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index a1bb6f4ce4..15e961751a 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -189,9 +189,9 @@ class OVCLIExportTestCase(unittest.TestCase): "stable-diffusion-3": 6, "flux": 4, "flux-fill": 4, - "lfm2": ( - 2 if is_openvino_version(">=", "2026.0") else 0 - ), # Tokenizers fail to convert on 2025.4, ticket: CVS-176880 + "lfm2": 2 + if is_openvino_version(">=", "2026.0") + else 0, # Tokenizers fail to convert on 2025.4, ticket: CVS-176880 "llava": 2, "mistral3": 2, "sana": 2, @@ -294,11 +294,9 @@ class OVCLIExportTestCase(unittest.TestCase): "whisper", "int8", "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", - ( - {"encoder": 14, "decoder": 22, "decoder_with_past": 22} - if is_transformers_version("<=", "4.45") - else {"encoder": 14, "decoder": 22, "decoder_with_past": 25} - ), + {"encoder": 14, "decoder": 22, "decoder_with_past": 22} + if is_transformers_version("<=", "4.45") + else {"encoder": 14, "decoder": 22, "decoder_with_past": 25}, ( {"encoder": {"int8": 14}, "decoder": {"int8": 22}, "decoder_with_past": {"int8": 17}} if is_transformers_version("<=", "4.45") @@ -310,11 +308,9 @@ class OVCLIExportTestCase(unittest.TestCase): "whisper", "f8e4m3", "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", - ( - {"encoder": 16, "decoder": 26, "decoder_with_past": 23} - if is_transformers_version("<=", "4.45") - else {"encoder": 16, "decoder": 26, "decoder_with_past": 25} - ), + {"encoder": 16, "decoder": 26, "decoder_with_past": 23} + if is_transformers_version("<=", "4.45") + else {"encoder": 16, "decoder": 26, "decoder_with_past": 25}, ( {"encoder": {"f8e4m3": 14}, "decoder": {"f8e4m3": 22}, "decoder_with_past": {"f8e4m3": 17}} if is_transformers_version("<=", "4.45") @@ -490,14 +486,12 @@ class OVCLIExportTestCase(unittest.TestCase): "t5", "int8", "--dataset c4:seq_len=64 --num-samples 1", - ( - {"encoder": 30, "decoder": 52, "decoder_with_past": 61} - if is_transformers_version("<=", "4.45") - else { - "encoder": 30, - "decoder": 52, - } - ), + {"encoder": 30, "decoder": 52, "decoder_with_past": 61} + if is_transformers_version("<=", "4.45") + else { + "encoder": 30, + "decoder": 52, + }, ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") @@ -769,9 +763,9 @@ class OVCLIExportTestCase(unittest.TestCase): 'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" ' "--dataset contextual --num-samples 1 --trust-remote-code", { - "lm_model": ( - {"int8": 10, "int4": 20} if is_transformers_version(">=", "4.54") else {"int8": 6, "int4": 24} - ), + "lm_model": {"int8": 10, "int4": 20} + if is_transformers_version(">=", "4.54") + else {"int8": 6, "int4": 24}, "text_embeddings_model": {"int8": 1}, "vision_embeddings_model": {"int8": 1}, "vision_embeddings_merger_model": {"int8": 12}, diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index fb7a4dbdc6..8c4f8a9370 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -340,14 +340,12 @@ class OVQuantizerTest(unittest.TestCase): dataset="wikitext2:seq_len=64", num_samples=1, ), - ( - {"encoder": 30, "decoder": 52, "decoder_with_past": 61} - if is_transformers_version("<=", "4.45") - else { - "encoder": 30, - "decoder": 52, - } - ), + {"encoder": 30, "decoder": 52, "decoder_with_past": 61} + if is_transformers_version("<=", "4.45") + else { + "encoder": 30, + "decoder": 52, + }, ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") @@ -641,11 +639,9 @@ class OVWeightCompressionTest(unittest.TestCase): group_size=32, ignored_scope={ "names": [ - ( - "__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" - if is_transformers_version("<", "4.57") - else "__module.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" - ) + "__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" + if is_transformers_version("<", "4.57") + else "__module.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" ] }, ),