Commit a8e69a3

Quantization support for CausalVisualLMs (#951)
* Quantization support for CausalVisualLMs
* Tweaks
* Add tests
* Fix test
* Added a data-aware compression test for llava-next
* Add assemble_inputs() method to OVModelForVisualCausalLM
* Add support for minicpmv
* Add support for nanollava
* Add group size
* Fix test
* Added support for cli compression
* Tweak refs
* Fix test
* Rename assemble_input; fix tests
* Addressed suggested changes
1 parent c887610 commit a8e69a3

8 files changed: +391, -76 lines

optimum/commands/export/openvino.py

Lines changed: 11 additions & 4 deletions
@@ -329,11 +329,18 @@ def run(self):
             model.save_pretrained(self.args.output)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif task.startswith("text-generation") and quantize_with_dataset:
-            from optimum.intel import OVModelForCausalLM
+        elif (task.startswith("text-generation") or task == "image-text-to-text") and quantize_with_dataset:
+            if task.startswith("text-generation"):
+                from optimum.intel import OVModelForCausalLM
 
-            # To quantize a text-generation model with a dataset, an instantiated OVModelForCausalLM is required
-            model = OVModelForCausalLM.from_pretrained(
+                model_cls = OVModelForCausalLM
+            else:
+                from optimum.intel import OVModelForVisualCausalLM
+
+                model_cls = OVModelForVisualCausalLM
+
+            # To quantize a model with a dataset, an instance of a model class is required
+            model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
                 quantization_config=quantization_config,
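
For reference, a minimal Python sketch of what this CLI branch now does for an image-text-to-text model. The model id, output directory, and the "contextual" dataset name are illustrative assumptions, not taken from the diff; the config values mirror typical CLI arguments.

# Sketch only: mirrors the class selection added above (names marked below are assumptions).
from optimum.intel import OVModelForCausalLM, OVModelForVisualCausalLM, OVWeightQuantizationConfig

task = "image-text-to-text"  # e.g. the value parsed from --task
model_cls = OVModelForCausalLM if task.startswith("text-generation") else OVModelForVisualCausalLM

# To quantize with a dataset, an instantiated model is required.
model = model_cls.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",  # hypothetical model id
    export=True,
    quantization_config=OVWeightQuantizationConfig(
        bits=4,
        dataset="contextual",  # assumed predefined visual LM dataset name
        num_samples=32,
    ),
)
model.save_pretrained("llava_int4_ov")  # hypothetical output directory
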

optimum/intel/openvino/configuration.py

Lines changed: 17 additions & 8 deletions
@@ -26,6 +26,7 @@
 from optimum.configuration_utils import BaseConfig
 
 from ..utils.import_utils import is_nncf_available
+from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
 
 
 if is_nncf_available():
@@ -350,6 +351,11 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         gptq (`bool`, *optional*):
             Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the
             difference between activations of a compressed and original layer. Dataset is required to run GPTQ.
+        processor (`str`, *optional*):
+            A transformers processor used to process inputs for multi-modal models. You can pass either:
+                - A string, the *model id* of a predefined processor hosted inside a model repo on huggingface.co.
+                - A path to a *directory* containing files required by the processor, for instance saved
+                    using the [`~AutoProcessor.save_pretrained`] method, e.g., `./my_model_directory/`.
     """
 
     def __init__(
@@ -369,6 +375,7 @@ def __init__(
         scale_estimation: bool = None,
         weight_format: Optional[str] = None,
         gptq: bool = None,
+        processor: Optional[str] = None,
         **kwargs,
     ):
         super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
@@ -383,6 +390,7 @@ def __init__(
         self.scale_estimation = scale_estimation
         self.weight_format = weight_format
         self.gptq = gptq
+        self.processor = processor
         self.post_init()
 
     def post_init(self):
@@ -400,16 +408,14 @@ def post_init(self):
                 f"If you wish to provide a custom dataset, please use the `OVQuantizer` instead."
             )
         if self.dataset is not None and isinstance(self.dataset, str):
-            llm_datasets = ["wikitext2", "c4", "c4-new"]
-            stable_diffusion_datasets = [
-                "conceptual_captions",
-                "laion/220k-GPT4Vision-captions-from-LIVIS",
-                "laion/filtered-wit",
-            ]
-            if self.dataset not in llm_datasets + stable_diffusion_datasets:
+            lm_datasets = ["wikitext2", "c4", "c4-new"]
+            visual_lm_datasets = list(PREDEFINED_VISUAL_LM_DATASETS.keys())
+            stable_diffusion_datasets = list(PREDEFINED_SD_DATASETS.keys())
+            if self.dataset not in lm_datasets + visual_lm_datasets + stable_diffusion_datasets:
                 raise ValueError(
                     f"""You have entered a string value for dataset. You can only choose between
-                    {llm_datasets} for LLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
+                    {lm_datasets} for LLMs, {visual_lm_datasets} for visual LLMs
+                    or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
                 )
 
         if self.bits not in [4, 8]:
@@ -444,6 +450,9 @@ def post_init(self):
         if self.tokenizer is not None and not isinstance(self.tokenizer, str):
             raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")
 
+        if self.processor is not None and not isinstance(self.processor, str):
+            raise ValueError(f"Processor is expected to be a string, but found {self.processor}")
+
         if self.weight_format is None:
             self.weight_format = "int4" if self.bits == 4 else "int8"
         if self.weight_format not in ["int4", "int8", "mxfp4"]:
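
A minimal sketch of the new `processor` option and its validation, assuming a hypothetical processor id and the "contextual" dataset name (both illustrative; the dataset must be one of the predefined visual LM dataset keys):

# Sketch only: the ids and dataset name below are assumptions, not from the diff.
from optimum.intel import OVWeightQuantizationConfig

config = OVWeightQuantizationConfig(
    bits=4,
    dataset="contextual",                  # assumed predefined visual LM dataset name
    processor="llava-hf/llava-1.5-7b-hf",  # hypothetical processor id; must be a string
    num_samples=32,
)

# post_init() rejects non-string processors, mirroring the tokenizer check above.
try:
    OVWeightQuantizationConfig(bits=4, processor=123)
except ValueError as err:
    print(err)  # Processor is expected to be a string, but found 123
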

optimum/intel/openvino/modeling_visual_language.py

Lines changed: 111 additions & 23 deletions
@@ -1,6 +1,8 @@
+import copy
 import logging
 import os
 import warnings
+from abc import abstractmethod
 from pathlib import Path
 from typing import Dict, Optional, Tuple, Union
 
@@ -10,11 +12,19 @@
 from huggingface_hub import hf_hub_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation
-from transformers import AutoConfig, GenerationConfig, GenerationMixin, PretrainedConfig
+from PIL.Image import Image
+from transformers import (
+    AutoConfig,
+    GenerationConfig,
+    GenerationMixin,
+    PretrainedConfig,
+    PreTrainedTokenizer,
+)
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 
 from ...exporters.openvino import main_export
 from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
+from .. import OVQuantizer
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
 from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM
@@ -181,6 +191,7 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
         self._main_input = "images" if model_has_input_output_name(self.model, "images") else "pixel_values"
 
     def forward(self, pixel_values, **kwargs):
+        self._compile()
         inputs = {self._main_input: pixel_values}
         if len(self.input_names) > 1:
             for name in self.input_names:
@@ -210,6 +221,7 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
         self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
 
     def forward(self, image_feature, pos_embed, key_padding_mask):
+        self._compile()
         result = self.request(
             {"image_feature": image_feature, "pos_embed": pos_embed, "key_padding_mask": key_padding_mask}
         )[0]
@@ -244,7 +256,7 @@ def __init__(
         self.ov_config = {} if ov_config is None else {**ov_config}
         self.preprocessors = kwargs.get("preprocessors", [])
         self.lm_model = language_model
-        self.text_embdings_model = text_embeddings
+        self.text_embeddings_model = text_embeddings
         self.vision_embeddings_model = vision_embeddings
         self._supports_cache_class = False
         self.main_input_name = "input_ids"
@@ -261,13 +273,13 @@ def __init__(
         self._set_ov_config_parameters()
         self.language_model = OVModelWithEmbedForCausalLM(
             self.lm_model,
-            self.text_embdings_model,
+            self.text_embeddings_model,
             config=config,
             deivce=device,
             ov_config=ov_config,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
-            compile=not self._compile_only,
+            compile=not self._compile_only and enable_compilation,
             compile_only=self._compile_only,
         )
         self.vision_embeddings = OVVisionEmbedding(self.vision_embeddings_model, self)
@@ -287,6 +299,18 @@ def __init__(
         except AttributeError:
             pass
 
+    def clear_requests(self):
+        if self._compile_only:
+            raise ValueError(
+                "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option"
+            )
+
+        self.language_model.clear_requests()
+        components = [self.vision_embeddings] + [getattr(self, part) for part in self.additional_parts]
+        for component in components:
+            if component is not None:
+                component.request = None
+
     def compile(self):
         self.language_model.compile()
         self.vision_embeddings._compile()
@@ -304,11 +328,11 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
             save_directory (`str` or `Path`):
                 The directory where to save the model files.
         """
-        src_files = [self.lm_model, self.text_embdings_model, self.vision_embeddings_model]
+        src_files = [self.lm_model, self.text_embeddings_model, self.vision_embeddings_model]
         dst_file_names = [
             "openvino_language_model.xml",
             "openvino_text_embeddings_model.xml",
-            "openvino_vision_embeddings.xml",
+            "openvino_vision_embeddings_model.xml",
         ]
         for part in self.additional_parts:
             model = getattr(self, f"{part}_model", None)
@@ -387,26 +411,18 @@ def _from_pretrained(
                 raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
             token = use_auth_token
 
-        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
-
-        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
-        compile_only = kwargs.get("compile_only", False)
-
-        # Load model from a local directory
-        if os.path.isdir(model_id):
-            model_save_dir = Path(model_id)
         model_file_names = {
             "language_model": "openvino_language_model.xml",
             "text_embeddings": "openvino_text_embeddings_model.xml",
            "vision_embeddings": "openvino_vision_embeddings_model.xml",
         }
 
+        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
         for part in model_cls.additional_parts:
             model_file_names[part] = f"openvino_{part}_model.xml"
-        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
-        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
         compile_only = kwargs.get("compile_only", False)
         if os.path.isdir(model_id):
+            # Load model from a local directory
            model_save_dir = Path(model_id)
            file_names = {k: os.path.join(model_id, model_file_names[k]) for k in model_file_names}
         else:
@@ -424,11 +440,11 @@ def _from_pretrained(
                 file_names[name] = model_cache_path
             model_save_dir = Path(model_cache_path).parent
         if not compile_only:
-            language_model = model_cls.load_model(file_names["language_model"], quantization_config)
-            text_embeddings = model_cls.load_model(file_names["text_embeddings"], quantization_config)
-            vision_embeddings = model_cls.load_model(file_names["vision_embeddings"], quantization_config)
+            language_model = model_cls.load_model(file_names["language_model"])
+            text_embeddings = model_cls.load_model(file_names["text_embeddings"])
+            vision_embeddings = model_cls.load_model(file_names["vision_embeddings"])
             for part in model_cls.additional_parts:
-                kwargs[part] = model_cls.load_model(file_names[part], quantization_config)
+                kwargs[part] = model_cls.load_model(file_names[part])
         else:
             language_model = model_cls._compile_model(
                 file_names["language_model"],
@@ -468,7 +484,12 @@ def _from_pretrained(
         except Exception:
             pass
 
-        return model_cls(
+        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
+        to_quantize = not compile_only and quantization_config is not None
+        if to_quantize:
+            kwargs["compile"] = False
+
+        model = model_cls(
             language_model=language_model,
             text_embeddings=text_embeddings,
             vision_embeddings=vision_embeddings,
@@ -478,6 +499,15 @@ def _from_pretrained(
             **kwargs,
         )
 
+        if to_quantize:
+            quantization_config_copy = copy.deepcopy(quantization_config)
+            quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
+            potential_processor_id = config.mm_vision_tower if isinstance(model, _OVNanoLlavaForCausalLM) else model_id
+            quantization_config_copy.processor = quantization_config.processor or potential_processor_id
+            OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
+
+        return model
+
     @classmethod
     def _from_transformers(
         cls,
@@ -556,8 +586,8 @@ def half(self):
         """
         apply_moc_transformations(self.lm_model, cf=False)
         compress_model_transformation(self.lm_model)
-        apply_moc_transformations(self.text_embdings_model, cf=False)
-        compress_model_transformation(self.text_embdings_model)
+        apply_moc_transformations(self.text_embeddings_model, cf=False)
+        compress_model_transformation(self.text_embeddings_model)
         apply_moc_transformations(self.vision_embeddings_model, cf=False)
         compress_model_transformation(self.vision_embeddings_model)
         for part in self.additional_parts:
@@ -695,6 +725,18 @@ def can_generate(self):
         """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
         return True
 
+    @staticmethod
+    @abstractmethod
+    def preprocess_inputs(
+        processor,
+        text: str,
+        image: Optional[Image] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        """
+        Preprocess input instruction and an image.
+        """
+
 
 class _OVLlavaForCausalLM(OVModelForVisualCausalLM):
     def __init__(
@@ -858,6 +900,20 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):
         position_ids[attention_mask == 0] = 1
         return attention_mask, position_ids
 
+    @staticmethod
+    def preprocess_inputs(
+        processor,
+        text: str,
+        image: Optional[Image] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        if image is None:
+            raise ValueError("Image is required.")
+        chat_template = [{"role": "user", "content": [{"type": "text", "text": text}, {"type": "image"}]}]
+        prompt = processor.apply_chat_template(chat_template, add_generation_prompt=True)
+        inputs = processor(images=image, text=prompt, return_tensors="pt")
+        return inputs
+
 
 class _OVLlavaNextForCausalLM(_OVLlavaForCausalLM):
     # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_next/modeling_llava_next.py#L655
@@ -1372,6 +1428,19 @@ def merge_vision_text_embeddings(
         )
         return vllm_embedding, attention_mask, position_ids
 
+    @staticmethod
+    def preprocess_inputs(
+        processor,
+        text: str,
+        image: Optional[Image] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        if image is None:
+            raise ValueError("Image is required.")
+        prompt = f"<|im_start|>user\n(<image>./</image>)\n{text}<|im_end|>\n<|im_start|>assistant\n"
+        inputs = processor([prompt], [image], return_tensors="pt")
+        return inputs
+
 
 class _OVNanoLlavaForCausalLM(OVModelForVisualCausalLM):
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
@@ -1544,6 +1613,25 @@ def get_multimodal_embeddings(
 
         return new_input_embeds, attention_mask, position_ids
 
+    @staticmethod
+    def preprocess_inputs(
+        processor,
+        text: str,
+        image: Optional[Image] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        if tokenizer is None:
+            raise ValueError("Tokenizer is required.")
+        messages = [{"role": "user", "content": f"<image>\n{text}"}]
+        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
+        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+        attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
+        result = {"input_ids": input_ids, "attention_mask": attention_mask}
+        if image is not None:
+            result["images"] = torch.unsqueeze(processor(images=image, return_tensors="pt")["pixel_values"][0], 0)
+        return result
+
 
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
