Commit aa0e0c2

Refactor how quantization is applied during optimum-cli

1 parent: 906008d

7 files changed: +68 −70 lines changed

optimum/commands/export/openvino.py (37 additions, 38 deletions)

@@ -432,7 +432,6 @@ def run(self):
             ov_config = OVConfig(quantization_config=quantization_config)

         quantization_config = ov_config.quantization_config if ov_config else None
-        quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None
         task = infer_task(self.args.task, self.args.model, library_name=library_name)
         # in some cases automatic task detection for multimodal models gives incorrect results
         if self.args.task == "auto" and library_name == "transformers":
@@ -448,7 +447,29 @@ def run(self):
             if getattr(config, "model_type", "") in MULTI_MODAL_TEXT_GENERATION_MODELS:
                 task = "image-text-to-text"

-        if library_name == "diffusers" and quantize_with_dataset:
+        if not quantization_config:
+            # If no explicit quantization is requested, proceed to export only. That said, INT8 weight-only quantization
+            # still will be applied if the model is large enough.
+            # TODO : add input shapes
+            main_export(
+                model_name_or_path=self.args.model,
+                output=self.args.output,
+                task=self.args.task,
+                framework=self.args.framework,
+                cache_dir=self.args.cache_dir,
+                trust_remote_code=self.args.trust_remote_code,
+                pad_token_id=self.args.pad_token_id,
+                ov_config=ov_config,
+                stateful=not self.args.disable_stateful,
+                convert_tokenizer=not self.args.disable_convert_tokenizer,
+                library_name=library_name,
+                variant=self.args.variant,
+                model_kwargs=self.args.model_kwargs,
+                # **input_shapes,
+            )
+            return
+
+        if library_name == "diffusers":
             if not is_diffusers_available():
                 raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models"))

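Note: after this hunk, a CLI run with no quantization options takes the early main_export path and returns immediately, while explicit quantization falls through to the model-class path below. For illustration only (model names and output directories are made up; flag spellings follow current optimum-intel releases):

# Plain export: no explicit quantization, though INT8 weight-only
# compression is still applied by default to sufficiently large models.
optimum-cli export openvino --model gpt2 gpt2_ov

# Explicit quantization: skips the early return above and is handled by
# the OVQuantizer-based path added later in this file.
optimum-cli export openvino --model gpt2 --weight-format int4 gpt2_ov_int4
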
@@ -490,21 +511,8 @@ def run(self):
             else:
                 raise NotImplementedError(f"Quantization isn't supported for class {class_name}.")

-            model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config)
-            model.save_pretrained(self.args.output)
-            if not self.args.disable_convert_tokenizer:
-                maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif (
-            quantize_with_dataset
-            and (
-                task in ["fill-mask", "zero-shot-image-classification"]
-                or task.startswith("text-generation")
-                or task.startswith("text2text-generation")
-                or task.startswith("automatic-speech-recognition")
-                or task.startswith("feature-extraction")
-            )
-            or (task == "image-text-to-text" and quantization_config is not None)
-        ):
+            model = model_cls.from_pretrained(self.args.model, export=True, load_in_8bit=False, compile=False)
+        else:
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM

@@ -542,40 +550,31 @@ def run(self):
                     f"Unable to find a matching model class for the task={task} and library_name={library_name}."
                 )

-            # In this case, to apply quantization an instance of a model class is required
             model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
-                quantization_config=quantization_config,
+                compile=False,
+                load_in_8bit=False,
                 stateful=not self.args.disable_stateful,
                 trust_remote_code=self.args.trust_remote_code,
                 variant=self.args.variant,
                 cache_dir=self.args.cache_dir,
             )
-            model.save_pretrained(self.args.output)

+        from optimum.intel import OVConfig, OVQuantizer
+
+        OVQuantizer(model).quantize(
+            ov_config=OVConfig(quantization_config=quantization_config), save_directory=self.args.output
+        )
+
+        if library_name == "diffusers":
+            if not self.args.disable_convert_tokenizer:
+                maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
+        else:
             preprocessors = maybe_load_preprocessors(self.args.model, trust_remote_code=self.args.trust_remote_code)
             save_preprocessors(preprocessors, model.config, self.args.output, self.args.trust_remote_code)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task)
-        else:
-            # TODO : add input shapes
-            main_export(
-                model_name_or_path=self.args.model,
-                output=self.args.output,
-                task=self.args.task,
-                framework=self.args.framework,
-                cache_dir=self.args.cache_dir,
-                trust_remote_code=self.args.trust_remote_code,
-                pad_token_id=self.args.pad_token_id,
-                ov_config=ov_config,
-                stateful=not self.args.disable_stateful,
-                convert_tokenizer=not self.args.disable_convert_tokenizer,
-                library_name=library_name,
-                variant=self.args.variant,
-                model_kwargs=self.args.model_kwargs,
-                # **input_shapes,
-            )


 def prepare_wc_config(args, default_configs):
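
Taken together, these hunks make the CLI always load an uncompressed, uncompiled model and delegate all quantization to OVQuantizer, instead of passing quantization_config into from_pretrained. A minimal standalone sketch of the equivalent Python flow, mirroring the calls in the diff (the model id, output directory, and 4-bit config are illustrative, not part of this commit):

from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

# Export without implicit compression or compilation, as the CLI now does.
model = OVModelForCausalLM.from_pretrained(
    "gpt2",  # illustrative model id
    export=True,
    load_in_8bit=False,
    compile=False,
)

# Apply quantization in a single post-export step and save the result.
quantization_config = OVWeightQuantizationConfig(bits=4)  # illustrative config
OVQuantizer(model).quantize(
    ov_config=OVConfig(quantization_config=quantization_config),
    save_directory="gpt2_ov_int4",
)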

optimum/intel/openvino/modeling_base.py (4 additions, 4 deletions)

@@ -698,11 +698,11 @@ def _export(
         )
         compile_only = False

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="fp32")

         variant = kwargs.pop("variant", None)

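The same default-selection change recurs in each modeling class below: dtype="auto" replaces the old dtype="fp32" fallback, and ov_config is left as None only when neither load_in_8bit nor quantization_config is given. A minimal sketch of that shared branch, extracted for clarity (the helper name is hypothetical; OVConfig is the real optimum.intel class):

from typing import Optional
from optimum.intel import OVConfig

def _default_export_ov_config(load_in_8bit: Optional[bool], quantization_config) -> Optional[OVConfig]:
    # Keep dtype="auto" whenever the caller asked for anything explicit.
    ov_config = OVConfig(dtype="auto")
    if load_in_8bit is None and quantization_config is None:
        # Nothing requested: None lets conversion pick the default, which
        # weight-quantizes models larger than ~1B parameters to INT8.
        ov_config = None
    return ov_config
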
optimum/intel/openvino/modeling_decoder.py (6 additions, 6 deletions)

@@ -323,11 +323,11 @@ def _export(
         if use_cache:
             task = task + "-with-past"

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
-            ov_export_config = None
-        else:
-            ov_export_config = OVConfig(dtype="auto")
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
+            ov_config = None

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)

@@ -351,7 +351,7 @@ def _export(
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            ov_config=ov_export_config,
+            ov_config=ov_config,
             stateful=stateful,
             model_loading_kwargs=model_loading_kwargs,
             library_name=cls._library_name,

optimum/intel/openvino/modeling_diffusion.py (4 additions, 5 deletions)

@@ -614,12 +614,11 @@ def _export(
         )
         compile_only = False

-        # If load_in_8bit and quantization_config not specified then ov_config is set
-        # to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="auto")

         torch_dtype = kwargs.pop("torch_dtype", None)

optimum/intel/openvino/modeling_open_clip.py (8 additions, 8 deletions)

@@ -245,11 +245,11 @@ def _export(
         # would end-up removing the directory containing the underlying OpenVINO model
         cls._model_save_dir_tempdirectory_instance = save_dir

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="fp32")

         def fn_get_submodels(model):
             return {"model_text": model.text}
@@ -370,11 +370,11 @@ def _export(
         # would end-up removing the directory containing the underlying OpenVINO model
         cls._model_save_dir_tempdirectory_instance = save_dir

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="fp32")

         def fn_get_submodels(model):
             return {"model_vision": model.visual}

optimum/intel/openvino/modeling_seq2seq.py (4 additions, 4 deletions)

@@ -593,11 +593,11 @@ def _export(
                 "Please provide openvino model obtained using optimum-cli or saved on disk using `save_pretrained`"
             )
         compile_only = False
-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            ov_config = OVConfig(dtype="fp32")
         stateful = kwargs.get("stateful", True)
         variant = kwargs.pop("variant", None)

optimum/intel/openvino/modeling_visual_language.py (5 additions, 5 deletions)

@@ -611,6 +611,7 @@ def _from_pretrained(
         if to_quantize:
             from optimum.intel.openvino.quantization import OVQuantizer

+            print("!!!")
             quantization_config_copy = copy.deepcopy(quantization_config)
             quantization_config_copy.tokenizer = str(quantization_config.tokenizer or model_id)
             potential_processor_id = config.mm_vision_tower if isinstance(model, _OVNanoLlavaForCausalLM) else model_id
@@ -655,12 +656,11 @@ def _export(
         if task is None:
             task = cls.export_feature

-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
+        ov_config = OVConfig(dtype="auto")
+        if load_in_8bit is None and quantization_config is None:
+            # If load_in_8bit and quantization_config are not specified then ov_config is set to None, and
+            # models larger than 1B parameters will be quantized to int8
             ov_config = None
-        else:
-            # Export in fp32 if compression won't be applied later
-            ov_config = OVConfig(dtype="fp32" if load_in_8bit is False else "auto")

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
         variant = kwargs.pop("variant", None)
