Commit aa0e0c2

Refactor how quantization is applied during optimum-cli

1 parent: 906008d

7 files changed: +68 −70 lines changed

optimum/commands/export/openvino.py (37 additions, 38 deletions)

@@ -432,7 +432,6 @@ def run(self):
             ov_config = OVConfig(quantization_config=quantization_config)

         quantization_config = ov_config.quantization_config if ov_config else None
-        quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None
         task = infer_task(self.args.task, self.args.model, library_name=library_name)
         # in some cases automatic task detection for multimodal models gives incorrect results
         if self.args.task == "auto" and library_name == "transformers":
@@ -448,7 +447,29 @@ def run(self):
             if getattr(config, "model_type", "") in MULTI_MODAL_TEXT_GENERATION_MODELS:
                 task = "image-text-to-text"

-        if library_name == "diffusers" and quantize_with_dataset:
+        if not quantization_config:
+            # If no explicit quantization is requested, proceed to export only. That said, INT8 weight-only quantization
+            # still will be applied if the model is large enough.
+            # TODO : add input shapes
+            main_export(
+                model_name_or_path=self.args.model,
+                output=self.args.output,
+                task=self.args.task,
+                framework=self.args.framework,
+                cache_dir=self.args.cache_dir,
+                trust_remote_code=self.args.trust_remote_code,
+                pad_token_id=self.args.pad_token_id,
+                ov_config=ov_config,
+                stateful=not self.args.disable_stateful,
+                convert_tokenizer=not self.args.disable_convert_tokenizer,
+                library_name=library_name,
+                variant=self.args.variant,
+                model_kwargs=self.args.model_kwargs,
+                # **input_shapes,
+            )
+            return
+
+        if library_name == "diffusers":
             if not is_diffusers_available():
                 raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models"))

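Note: after this hunk, a CLI run with no quantization options takes the early main_export path and returns immediately, while explicit quantization falls through to the model-class path below. For illustration only (model names and output directories are made up; flag spellings follow current optimum-intel releases):

# Plain export: no explicit quantization, though INT8 weight-only
# compression is still applied by default to sufficiently large models.
optimum-cli export openvino --model gpt2 gpt2_ov

# Explicit quantization: skips the early return above and is handled by
# the OVQuantizer-based path added later in this file.
optimum-cli export openvino --model gpt2 --weight-format int4 gpt2_ov_int4
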
@@ -490,21 +511,8 @@ def run(self):
             else:
                 raise NotImplementedError(f"Quantization isn't supported for class {class_name}.")

-            model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config)
-            model.save_pretrained(self.args.output)
-            if not self.args.disable_convert_tokenizer:
-                maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif (
-            quantize_with_dataset
-            and (
-                task in ["fill-mask", "zero-shot-image-classification"]
-                or task.startswith("text-generation")
-                or task.startswith("text2text-generation")
-                or task.startswith("automatic-speech-recognition")
-                or task.startswith("feature-extraction")
-            )
-            or (task == "image-text-to-text" and quantization_config is not None)
-        ):
+            model = model_cls.from_pretrained(self.args.model, export=True, load_in_8bit=False, compile=False)
+        else:
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM

@@ -542,40 +550,31 @@ def run(self):
                     f"Unable to find a matching model class for the task={task} and library_name={library_name}."
                 )

-            # In this case, to apply quantization an instance of a model class is required
             model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
-                quantization_config=quantization_config,
+                compile=False,
+                load_in_8bit=False,
                 stateful=not self.args.disable_stateful,
                 trust_remote_code=self.args.trust_remote_code,
                 variant=self.args.variant,
                 cache_dir=self.args.cache_dir,
             )
-            model.save_pretrained(self.args.output)

+        from optimum.intel import OVConfig, OVQuantizer
+
+        OVQuantizer(model).quantize(
+            ov_config=OVConfig(quantization_config=quantization_config), save_directory=self.args.output
+        )
+
+        if library_name == "diffusers":
+            if not self.args.disable_convert_tokenizer:
+                maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
+        else:
             preprocessors = maybe_load_preprocessors(self.args.model, trust_remote_code=self.args.trust_remote_code)
             save_preprocessors(preprocessors, model.config, self.args.output, self.args.trust_remote_code)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task)
-        else:
-            # TODO : add input shapes
-            main_export(
-                model_name_or_path=self.args.model,
-                output=self.args.output,
-                task=self.args.task,
-                framework=self.args.framework,
-                cache_dir=self.args.cache_dir,
-                trust_remote_code=self.args.trust_remote_code,
-                pad_token_id=self.args.pad_token_id,
-                ov_config=ov_config,
-                stateful=not self.args.disable_stateful,
-                convert_tokenizer=not self.args.disable_convert_tokenizer,
-                library_name=library_name,
-                variant=self.args.variant,
-                model_kwargs=self.args.model_kwargs,
-                # **input_shapes,
-            )


 def prepare_wc_config(args, default_configs):
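
Taken together, these hunks make the CLI always load an uncompressed, uncompiled model and delegate all quantization to OVQuantizer, instead of passing quantization_config into from_pretrained. A minimal standalone sketch of the equivalent Python flow, mirroring the calls in the diff (the model id, output directory, and 4-bit config are illustrative, not part of this commit):

from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

# Export without implicit compression or compilation, as the CLI now does.
model = OVModelForCausalLM.from_pretrained(
    "gpt2",  # illustrative model id
    export=True,
    load_in_8bit=False,
    compile=False,
)

# Apply quantization in a single post-export step and save the result.
quantization_config = OVWeightQuantizationConfig(bits=4)  # illustrative config
OVQuantizer(model).quantize(
    ov_config=OVConfig(quantization_config=quantization_config),
    save_directory="gpt2_ov_int4",
)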

optimum/intel/openvino/modeling_base.py (4 additions, 4 deletions)

@@ -698,11 +698,11 @@ def _export(
         )
         compile_only = False

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="fp32")

         variant = kwargs.pop("variant", None)

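The same default-selection change recurs in each modeling class below: dtype="auto" replaces the old dtype="fp32" fallback, and ov_config is left as None only when neither load_in_8bit nor quantization_config is given. A minimal sketch of that shared branch, extracted for clarity (the helper name is hypothetical; OVConfig is the real optimum.intel class):

from typing import Optional
from optimum.intel import OVConfig

def _default_export_ov_config(load_in_8bit: Optional[bool], quantization_config) -> Optional[OVConfig]:
    # Keep dtype="auto" whenever the caller asked for anything explicit.
    ov_config = OVConfig(dtype="auto")
    if load_in_8bit is None and quantization_config is None:
        # Nothing requested: None lets conversion pick the default, which
        # weight-quantizes models larger than ~1B parameters to INT8.
        ov_config = None
    return ov_config
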
optimum/intel/openvino/modeling_decoder.py (6 additions, 6 deletions)

@@ -323,11 +323,11 @@ def _export(
         if use_cache:
             task = task + "-with-past"

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
-            ov_export_config = None
-        else:
-            ov_export_config = OVConfig(dtype="auto")
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
+            ov_config = None

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)

@@ -351,7 +351,7 @@ def _export(
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            ov_config=ov_export_config,
+            ov_config=ov_config,
             stateful=stateful,
             model_loading_kwargs=model_loading_kwargs,
             library_name=cls._library_name,

optimum/intel/openvino/modeling_diffusion.py (4 additions, 5 deletions)

@@ -614,12 +614,11 @@ def _export(
         )
         compile_only = False

-        # If load_in_8bit and quantization_config not specified then ov_config is set
-        # to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="auto")

         torch_dtype = kwargs.pop("torch_dtype", None)

optimum/intel/openvino/modeling_open_clip.py (8 additions, 8 deletions)

@@ -245,11 +245,11 @@ def _export(
         # would end-up removing the directory containing the underlying OpenVINO model
         cls._model_save_dir_tempdirectory_instance = save_dir

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="fp32")

         def fn_get_submodels(model):
             return {"model_text": model.text}
@@ -370,11 +370,11 @@ def _export(
         # would end-up removing the directory containing the underlying OpenVINO model
         cls._model_save_dir_tempdirectory_instance = save_dir

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="fp32")

         def fn_get_submodels(model):
             return {"model_vision": model.visual}

optimum/intel/openvino/modeling_seq2seq.py (4 additions, 4 deletions)

@@ -593,11 +593,11 @@ def _export(
                 "Please provide openvino model obtained using optimum-cli or saved on disk using `save_pretrained`"
             )
         compile_only = False
-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="fp32")
         stateful = kwargs.get("stateful", True)
         variant = kwargs.pop("variant", None)

optimum/intel/openvino/modeling_visual_language.py (5 additions, 5 deletions)

@@ -611,6 +611,7 @@ def _from_pretrained(
         if to_quantize:
             from optimum.intel.openvino.quantization import OVQuantizer

+            print("!!!")
             quantization_config_copy = copy.deepcopy(quantization_config)
             quantization_config_copy.tokenizer = str(quantization_config.tokenizer or model_id)
             potential_processor_id = config.mm_vision_tower if isinstance(model, _OVNanoLlavaForCausalLM) else model_id
@@ -655,12 +656,11 @@ def _export(
         if task is None:
             task = cls.export_feature

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            # Export in fp32 if compression won't be applied later
-            ov_config = OVConfig(dtype="fp32" if load_in_8bit is False else "auto")

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
         variant = kwargs.pop("variant", None)
