add quantization_config argument for OVModel
echarlaix committed Feb 23, 2024
1 parent 925e56d commit bf91033
Showing 6 changed files with 111 additions and 66 deletions.
28 changes: 8 additions & 20 deletions optimum/exporters/openvino/convert.py
@@ -120,11 +120,8 @@ def export(
         device (`str`, *optional*, defaults to `cpu`):
             The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for
             export on CUDA devices.
-        compression_option (`Optional[str]`, defaults to `None`):
-            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
-            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, `int4_asym_g64` - as previous but asymmetric w/ zero-point.
-        compression_ratio (`Optional[float]`, defaults to `None`):
-            Compression ratio between primary and backup precision (only relevant to INT4).
+        ov_config (`OVConfig`, *optional*):
+            The configuration containing the parameters related to quantization.
         input_shapes (`Optional[Dict]`, defaults to `None`):
             If specified, allows to use specific shapes for the example input provided to the exporter.
         stateful (`bool`, defaults to `True`):
@@ -233,11 +230,8 @@ def export_pytorch_via_onnx(
             If specified, allows to use specific shapes for the example input provided to the exporter.
         model_kwargs (optional[Dict[str, Any]], defaults to `None`):
             Additional kwargs for model export.
-        compression_option (`Optional[str]`, defaults to `None`):
-            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
-            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, `int4_asym_g64` - as previous but asymmetric w/ zero-point.
-        compression_ratio (`Optional[float]`, defaults to `None`):
-            Compression ratio between primary and backup precision (only relevant to INT4).
+        ov_config (`OVConfig`, *optional*):
+            The configuration containing the parameters related to quantization.
     Returns:
         `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, and the named inputs from
@@ -290,11 +284,8 @@ def export_pytorch(
             If specified, allows to use specific shapes for the example input provided to the exporter.
         model_kwargs (optional[Dict[str, Any]], defaults to `None`):
             Additional kwargs for model export
-        compression_option (`Optional[str]`, defaults to `None`):
-            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
-            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, `int4_asym_g64` - as previous but asymmetric w/ zero-point.
-        compression_ratio (`Optional[float]`, defaults to `None`):
-            Compression ratio between primary and backup precision (only relevant to INT4).
+        ov_config (`OVConfig`, *optional*):
+            The configuration containing the parameters related to quantization.
         stateful (`bool`, defaults to `False`):
             Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
@@ -452,11 +443,8 @@ def export_models(
             export on CUDA devices.
         input_shapes (Optional[Dict], optional, Defaults to None):
             If specified, allows to use specific shapes for the example input provided to the exporter.
-        compression_option (`Optional[str]`, defaults to `None`):
-            The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
-            `int4_sym_g64` - INT4 symmetric weights w/ group size 64, `int4_asym_g64` - as previous but asymmetric w/ zero-point.
-        compression_ratio (`Optional[int]`, defaults to `None`):
-            Compression ratio between primary and backup precision (only relevant to INT4).
+        ov_config (`OVConfig`, *optional*):
+            The configuration containing the parameters related to quantization.
         model_kwargs (Optional[Dict[str, Any]], optional):
             Additional kwargs for model export.
         stateful (`bool`, defaults to `True`)
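
After this change the export entry points take a single `ov_config` (`OVConfig`) instead of the string-based `compression_option` plus a separate `compression_ratio`. A minimal sketch of an export call under the new API, assuming the `OVConfig`/`OVWeightQuantizationConfig` classes referenced in this diff; the model id, output path, and parameter values are placeholders, not taken from this commit:

from optimum.exporters.openvino import main_export
from optimum.intel.openvino.configuration import OVConfig, OVWeightQuantizationConfig

# INT4 symmetric weights with group size 128; `ratio` covers the role the old
# compression_ratio argument played (the share of weights compressed to the
# primary 4-bit precision, with the remainder kept in the backup precision).
wq = OVWeightQuantizationConfig(bits=4, sym=True, group_size=128, ratio=0.8)

main_export(
    model_name_or_path="gpt2",                   # placeholder checkpoint
    output="gpt2_openvino_int4",                 # placeholder output directory
    task="text-generation-with-past",
    ov_config=OVConfig(quantization_config=wq),  # replaces compression_option / compression_ratio
)
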
22 changes: 16 additions & 6 deletions optimum/intel/openvino/modeling.py
@@ -16,7 +16,7 @@
 import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Optional, Union
+from typing import Dict, Optional, Union

 import numpy as np
 import openvino
@@ -53,6 +53,7 @@

 from ...exporters.openvino import main_export
 from ..utils.import_utils import is_timm_available, is_timm_version
+from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel
 from .utils import _is_timm_ov_dir

@@ -427,14 +428,17 @@ def _from_transformers(
         task: Optional[str] = None,
         trust_remote_code: bool = False,
         load_in_8bit: Optional[bool] = None,
-        load_in_4bit: Optional[bool] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         save_dir = TemporaryDirectory()
         save_dir_path = Path(save_dir.name)

-        # If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
-        compression_option = "fp32" if load_in_8bit is not None else None
+        # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
+        if load_in_8bit is None or not quantization_config:
+            ov_config = None
+        else:
+            ov_config = OVConfig(dtype="fp32")

         # OVModelForFeatureExtraction works with Transformers type of models, thus even sentence-transformers models are loaded as such.
         main_export(
@@ -448,12 +452,18 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option=compression_option,
+            ov_config=ov_config,
             library_name="transformers",
         )

         config.save_pretrained(save_dir_path)
-        return cls._from_pretrained(model_id=save_dir_path, config=config, load_in_8bit=load_in_8bit, **kwargs)
+        return cls._from_pretrained(
+            model_id=save_dir_path,
+            config=config,
+            load_in_8bit=load_in_8bit,
+            quantization_config=quantization_config,
+            **kwargs,
+        )


 MASKED_LM_EXAMPLE = r"""
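
With `quantization_config` now threaded through `_from_transformers`, 8-bit loading can be requested either through the existing boolean or through an explicit config, and the two are equivalent because `load_in_8bit=True` falls back to a default `{"bits": 8}` config (see modeling_base.py below). An illustrative pair of calls; the model id is an assumption, not part of this commit:

from optimum.intel import OVModelForFeatureExtraction

# Boolean shortcut: load_in_8bit=True defaults to quantization_config={"bits": 8}.
model = OVModelForFeatureExtraction.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2", export=True, load_in_8bit=True
)

# Explicit form using the new argument.
model = OVModelForFeatureExtraction.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2", export=True, quantization_config={"bits": 8}
)
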
38 changes: 27 additions & 11 deletions optimum/intel/openvino/modeling_base.py
@@ -31,6 +31,7 @@

 from ...exporters.openvino import export, main_export
 from ..utils.import_utils import is_nncf_available
+from .configuration import OVConfig, OVWeightQuantizationConfig
 from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, _print_compiled_model_properties

@@ -91,7 +92,7 @@ def __init__(
         self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None

     @staticmethod
-    def load_model(file_name: Union[str, Path], load_in_8bit: bool = False):
+    def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None):
         """
         Loads the model.
@@ -118,14 +119,15 @@ def fix_op_names_duplicates(model: openvino.runtime.Model):
         if file_name.suffix == ".onnx":
             model = fix_op_names_duplicates(model)  # should be called during model conversion to IR

-        if load_in_8bit:
+        if quantization_config:
             if not is_nncf_available():
                 raise ImportError(
                     "Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`"
                 )
-            import nncf
-
-            model = nncf.compress_weights(model)
+            from optimum.intel.openvino.quantization import _weight_only_quantization
+
+            model = _weight_only_quantization(model, quantization_config)

         return model

@@ -155,6 +157,7 @@ def _from_pretrained(
         from_onnx: bool = False,
         local_files_only: bool = False,
         load_in_8bit: bool = False,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         """
@@ -199,7 +202,12 @@
             subfolder=subfolder,
             local_files_only=local_files_only,
         )
-        model = cls.load_model(model_cache_path, load_in_8bit=load_in_8bit)
+
+        # Give default quantization config if not provided and load_in_8bit=True
+        if load_in_8bit:
+            quantization_config = quantization_config or {"bits": 8}
+
+        model = cls.load_model(model_cache_path, quantization_config=quantization_config)
         return cls(model, config=config, model_save_dir=model_cache_path.parent, **kwargs)

     @staticmethod
@@ -252,6 +260,7 @@ def _from_transformers(
         task: Optional[str] = None,
         trust_remote_code: bool = False,
         load_in_8bit: Optional[bool] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         """
@@ -275,10 +284,11 @@
         save_dir = TemporaryDirectory()
         save_dir_path = Path(save_dir.name)

-        # If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
-        compression_option = None
-        if load_in_8bit is not None:
-            compression_option = "fp32"
+        # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
+        if load_in_8bit is None or not quantization_config:
+            ov_config = None
+        else:
+            ov_config = OVConfig(dtype="fp32")

         main_export(
             model_name_or_path=model_id,
@@ -291,11 +301,17 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option=compression_option,
+            ov_config=ov_config,
         )

         config.save_pretrained(save_dir_path)
-        return cls._from_pretrained(model_id=save_dir_path, config=config, load_in_8bit=load_in_8bit, **kwargs)
+        return cls._from_pretrained(
+            model_id=save_dir_path,
+            config=config,
+            load_in_8bit=load_in_8bit,
+            quantization_config=quantization_config,
+            **kwargs,
+        )

     @classmethod
     def _to_load(
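
`load_model` now defers to `_weight_only_quantization` instead of calling `nncf.compress_weights` directly, so one code path handles both the 8-bit shortcut and richer configs. The helper's body is not part of this diff; below is a hypothetical sketch of what such a routine might do, using only documented nncf entry points (the function name and details here are assumptions, not the real implementation, which lives in optimum.intel.openvino.quantization):

import nncf
import openvino

from optimum.intel.openvino.configuration import OVWeightQuantizationConfig

def weight_only_quantization_sketch(model: openvino.runtime.Model, config) -> openvino.runtime.Model:
    # Accept the same inputs load_model now forwards: a config object or a plain dict.
    if isinstance(config, dict):
        config = OVWeightQuantizationConfig.from_dict(config)
    if config.bits == 8:
        # INT8 weight compression, matching the old load_in_8bit behaviour.
        return nncf.compress_weights(model)
    # INT4: map the config fields onto nncf's weight compression knobs.
    mode = nncf.CompressWeightsMode.INT4_SYM if config.sym else nncf.CompressWeightsMode.INT4_ASYM
    return nncf.compress_weights(model, mode=mode, group_size=config.group_size, ratio=config.ratio)
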
41 changes: 29 additions & 12 deletions optimum/intel/openvino/modeling_base_seq2seq.py
@@ -25,6 +25,7 @@
 from transformers.file_utils import add_start_docstrings

 from ...exporters.openvino import main_export
+from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel
 from .utils import (
     ONNX_DECODER_NAME,
@@ -111,6 +112,7 @@ def _from_pretrained(
         use_cache: bool = True,
         from_onnx: bool = False,
         load_in_8bit: bool = False,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         """
@@ -152,12 +154,19 @@
         decoder_file_name = decoder_file_name or default_decoder_file_name
         decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name
         decoder_with_past = None
+
+        # Give default quantization config if not provided and load_in_8bit=True
+        if load_in_8bit:
+            quantization_config = quantization_config or {"bits": 8}
+
         # Load model from a local directory
         if os.path.isdir(model_id):
-            encoder = cls.load_model(os.path.join(model_id, encoder_file_name), load_in_8bit)
-            decoder = cls.load_model(os.path.join(model_id, decoder_file_name), load_in_8bit)
+            encoder = cls.load_model(os.path.join(model_id, encoder_file_name), quantization_config)
+            decoder = cls.load_model(os.path.join(model_id, decoder_file_name), quantization_config)
             if use_cache:
-                decoder_with_past = cls.load_model(os.path.join(model_id, decoder_with_past_file_name), load_in_8bit)
+                decoder_with_past = cls.load_model(
+                    os.path.join(model_id, decoder_with_past_file_name), quantization_config
+                )

             model_save_dir = Path(model_id)
@@ -185,10 +194,10 @@ def _from_pretrained(
             file_names[name] = model_cache_path

         model_save_dir = Path(model_cache_path).parent
-        encoder = cls.load_model(file_names["encoder"], load_in_8bit)
-        decoder = cls.load_model(file_names["decoder"], load_in_8bit)
+        encoder = cls.load_model(file_names["encoder"], quantization_config)
+        decoder = cls.load_model(file_names["decoder"], quantization_config)
         if use_cache:
-            decoder_with_past = cls.load_model(file_names["decoder_with_past"], load_in_8bit)
+            decoder_with_past = cls.load_model(file_names["decoder_with_past"], quantization_config)

         return cls(
             encoder=encoder,
@@ -214,6 +223,7 @@ def _from_transformers(
         use_cache: bool = True,
         trust_remote_code: bool = False,
         load_in_8bit: Optional[bool] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         """
@@ -240,13 +250,15 @@

         if task is None:
             task = cls.export_feature

         if use_cache:
             task = task + "-with-past"

-        compression_option = None
-        if load_in_8bit is not None:
-            compression_option = "fp32"
+        # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
+        if load_in_8bit is None or not quantization_config:
+            ov_config = None
+        else:
+            ov_config = OVConfig(dtype="fp32")

         main_export(
             model_name_or_path=model_id,
             output=save_dir_path,
@@ -258,12 +270,17 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option=compression_option,
+            ov_config=ov_config,
         )

         config.save_pretrained(save_dir_path)
         return cls._from_pretrained(
-            model_id=save_dir_path, config=config, use_cache=use_cache, load_in_8bit=load_in_8bit, **kwargs
+            model_id=save_dir_path,
+            config=config,
+            use_cache=use_cache,
+            load_in_8bit=load_in_8bit,
+            quantization_config=quantization_config,
+            **kwargs,
         )

     def _reshape(self, model: openvino.runtime.Model, batch_size: int, sequence_length: int, is_decoder=True):
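
In the seq2seq base class the same `quantization_config` is forwarded to every submodel load, so the encoder, decoder, and decoder-with-past are compressed consistently. Illustrative usage, with the model id assumed rather than taken from this commit:

from optimum.intel import OVModelForSeq2SeqLM

model = OVModelForSeq2SeqLM.from_pretrained(
    "t5-small",
    export=True,
    use_cache=True,                   # also loads and compresses decoder_with_past
    quantization_config={"bits": 8},  # forwarded to each cls.load_model(...) call
)
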
22 changes: 13 additions & 9 deletions optimum/intel/openvino/modeling_decoder.py
@@ -34,7 +34,7 @@
 from ...exporters.openvino.stateful import model_has_state
 from ..utils.import_utils import is_nncf_available
 from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
-from .configuration import OVWeightQuantizationConfig, _check_default_4bit_configs
+from .configuration import OVConfig, OVWeightQuantizationConfig, _check_default_4bit_configs
 from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
 from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE

@@ -252,14 +252,14 @@ def _from_transformers(

         if task is None:
             task = cls.export_feature
-
         if use_cache:
             task = task + "-with-past"

-        # If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
-        compression_option = None
-        if load_in_8bit is not None or quantization_config is not None:
-            compression_option = "fp32"
+        # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
+        if load_in_8bit is None or not quantization_config:
+            ov_config = None
+        else:
+            ov_config = OVConfig(dtype="fp32")

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)

@@ -274,7 +274,7 @@
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            compression_option=compression_option,
+            ov_config=ov_config,
             stateful=stateful,
         )

@@ -285,8 +285,8 @@
             model_id=save_dir_path,
             config=config,
             use_cache=use_cache,
-            load_in_8bit=load_in_8bit,
             stateful=None,
+            load_in_8bit=load_in_8bit,
+            quantization_config=quantization_config,
             **kwargs,
         )
@@ -576,11 +576,15 @@ def _from_pretrained(
             local_files_only=local_files_only,
         )

+        # Give default quantization config if not provided and load_in_8bit=True
+        if load_in_8bit:
+            quantization_config = quantization_config or {"bits": 8}
+
         if isinstance(quantization_config, dict):
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)

         load_in_4bit = quantization_config.bits == 4 if quantization_config else False
-        model = cls.load_model(model_cache_path, load_in_8bit=False if load_in_4bit else load_in_8bit)
+        model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config)

         model_type = config.model_type.replace("_", "-")
         if model_type == "bloom":
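
In the decoder path a 4-bit config deliberately bypasses compression inside `load_model` (`quantization_config=None` when `bits == 4`), so the model is loaded uncompressed here and 4-bit weight compression can be applied to the full model afterwards. An illustrative 4-bit call; the model id and parameter values are assumptions, not part of this commit:

from optimum.intel import OVModelForCausalLM
from optimum.intel.openvino.configuration import OVWeightQuantizationConfig

model = OVModelForCausalLM.from_pretrained(
    "gpt2",  # placeholder checkpoint
    export=True,
    # bits=4 routes around load_model's weight compression, as in the hunk above.
    quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=128),
)
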