Merge branch 'huggingface:main' into model_patcher
jiqing-feng authored Mar 6, 2024
2 parents 4c3335b + 5e319aa commit b1f704a
Showing 12 changed files with 211 additions and 92 deletions.
7 changes: 5 additions & 2 deletions docs/source/inference.mdx
@@ -110,7 +110,7 @@ By default the quantization scheme will be [asymmetric](https://github.com/open

For INT4 quantization you can also specify the following arguments:
* The `--group-size` parameter defines the group size to use for quantization; setting it to `-1` results in per-column quantization.
* The `--ratio` CLI parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`.
* The `--ratio` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`.

Smaller `group_size` and `ratio` values usually improve accuracy at the cost of model size and inference latency.
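
As a rough sketch of how these CLI options map onto the Python API (the keyword names `bits`, `group_size` and `ratio` are taken from the `OVWeightQuantizationConfig` class touched elsewhere in this commit; the model id and values are illustrative, not prescriptive):

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

model_id = "gpt2"  # illustrative model id

# Illustrative 4-bit weight-only setup: group size of 128, 80% of layers in int4
quantization_config = OVWeightQuantizationConfig(bits=4, group_size=128, ratio=0.8)

# `quantization_config` is accepted by `from_pretrained`, as the `_from_pretrained`
# changes further down in this diff show
model = OVModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
```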

@@ -122,8 +122,11 @@ from optimum.intel import OVModelForCausalLM
model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
```

> **NOTE:** `load_in_8bit` is enabled by default for the models larger than 1 billion parameters.
<Tip warning={true}>

`load_in_8bit` is enabled by default for models larger than 1 billion parameters.

</Tip>
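
If the default is not wanted for a large model, it can be switched off explicitly (a minimal sketch using the same `load_in_8bit` flag shown above; the model id is illustrative):

```python
from optimum.intel import OVModelForCausalLM

model_id = "HuggingFaceH4/zephyr-7b-beta"  # illustrative model id
# Keep the original precision instead of the default 8-bit weight compression
model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=False)
```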

To apply quantization on both weights and activations, you can use the `OVQuantizer`; see the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#optimization) for more information.

9 changes: 5 additions & 4 deletions docs/source/optimization_ov.mdx
@@ -38,8 +38,6 @@ save_dir = "ptq_model"
def preprocess_function(examples, tokenizer):
return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)

# Load the default quantization configuration detailing the quantization we wish to apply
quantization_config = OVConfig()
# Instantiate our OVQuantizer using the desired configuration
quantizer = OVQuantizer.from_pretrained(model)
# Create the calibration dataset used to perform static quantization
@@ -52,7 +50,6 @@ calibration_dataset = quantizer.get_calibration_dataset(
)
# Apply static quantization and export the resulting quantized model to OpenVINO IR format
quantizer.quantize(
quantization_config=quantization_config,
calibration_dataset=calibration_dataset,
save_directory=save_dir,
)
@@ -72,7 +69,11 @@ from optimum.intel import OVModelForCausalLM
model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
```

> **NOTE:** `load_in_8bit` is enabled by default for models larger than 1 billion parameters.
<Tip warning={true}>

`load_in_8bit` is enabled by default for models larger than 1 billion parameters.

</Tip>

For 4-bit weight quantization, you can use the `quantization_config` argument to specify the optimization parameters, for example:

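The example itself is collapsed in this diff. A plausible sketch, based on the dict handling added in `_prepare_weight_quantization_config` below (a plain dict is normalized into an `OVWeightQuantizationConfig`, and `{"bits": 4}` picks up a curated per-model 4-bit configuration when one exists):

```python
from optimum.intel import OVModelForCausalLM

model_id = "HuggingFaceH4/zephyr-7b-beta"  # illustrative model id
# A plain dict is accepted and converted internally to an OVWeightQuantizationConfig;
# {"bits": 4} also falls back to a per-model default 4-bit config when one is registered
model = OVModelForCausalLM.from_pretrained(model_id, quantization_config={"bits": 4})
```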
6 changes: 3 additions & 3 deletions optimum/intel/generation/modeling.py
@@ -105,13 +105,13 @@ def __init__(
self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
self.model_dtype = kwargs.get("model_dtype", None)

logger.warning(
f"The class `{self.__class__}` has been deprecated and will be removed in optimum-intel v1.14, please use IPEXModel instead"
)
if isinstance(model, torch.jit.ScriptModule):
self.input_names = {
inputs.debugName().split(".")[0] for inputs in model.graph.inputs() if inputs.debugName() != "self"
}
logger.warning(
f"The class `{self.__class__}` has been deprecated for TorchScript models, please use `IPEXModelForCausalLM` instead"
)
else:
self.input_names = set()

2 changes: 1 addition & 1 deletion optimum/intel/openvino/configuration.py
@@ -114,7 +114,7 @@ def __init__(
**kwargs,
):
super().__init__()
self.compression = compression or DEFAULT_QUANTIZATION_CONFIG
self.compression = compression
self.input_info = input_info
self.save_onnx_model = save_onnx_model
self._enable_standard_onnx_export_option()
38 changes: 34 additions & 4 deletions optimum/intel/openvino/modeling_base.py
@@ -57,6 +57,7 @@ def __init__(
dynamic_shapes: bool = True,
ov_config: Optional[Dict[str, str]] = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
**kwargs,
):
self.config = config
@@ -91,6 +92,10 @@ def __init__(

self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None

self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)

@staticmethod
def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None):
"""
@@ -143,6 +148,15 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
dst_path = os.path.join(save_directory, OV_XML_FILE_NAME)
openvino.save_model(self.model, dst_path, compress_to_fp16=False)

self._save_openvino_config(save_directory)

def _save_openvino_config(self, save_directory: Union[str, Path]):
if self._openvino_config is not None:
if not isinstance(self._openvino_config.quantization_config.dataset, (str, type(None))):
self._openvino_config.quantization_config.dataset = None

self._openvino_config.save_pretrained(save_directory)
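
A consequence of `_save_openvino_config` (a sketch; the exact name of the written config file is not shown in this diff): a quantization config supplied at load time is now persisted next to the exported IR when the model is saved.

```python
from optimum.intel import OVModelForCausalLM

model_id = "gpt2"  # illustrative model id
model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
# Alongside the OpenVINO IR, the directory now also contains the serialized
# OVConfig describing the applied weight quantization (file name not shown here)
model.save_pretrained("ov_model_int8")
```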

@classmethod
def _from_pretrained(
cls,
@@ -203,12 +217,28 @@ def _from_pretrained(
local_files_only=local_files_only,
)

# Give default quantization config if not provided and load_in_8bit=True
if load_in_8bit:
quantization_config = quantization_config or {"bits": 8}
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

model = cls.load_model(model_cache_path, quantization_config=quantization_config)
return cls(model, config=config, model_save_dir=model_cache_path.parent, **kwargs)
return cls(
model,
config=config,
model_save_dir=model_cache_path.parent,
quantization_config=quantization_config,
**kwargs,
)

@staticmethod
def _prepare_weight_quantization_config(
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, load_in_8bit: bool = False
):
# Give default quantization config if not provided and load_in_8bit=True
if not quantization_config and load_in_8bit:
quantization_config = OVWeightQuantizationConfig(bits=8)
elif isinstance(quantization_config, dict):
quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)

return quantization_config
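
For reference, the normalization performed above can be exercised directly through the config class it relies on (a sketch; `from_dict` is the same helper called in the method body):

```python
from optimum.intel import OVWeightQuantizationConfig

# Equivalent to passing a plain dict through _prepare_weight_quantization_config
quantization_config = OVWeightQuantizationConfig.from_dict({"bits": 8})
assert quantization_config.bits == 8
```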

@staticmethod
def _cached_file(
11 changes: 8 additions & 3 deletions optimum/intel/openvino/modeling_base_seq2seq.py
@@ -58,6 +58,7 @@ def __init__(
dynamic_shapes: bool = True,
ov_config: Optional[Dict[str, str]] = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
**kwargs,
):
self.config = config
@@ -76,6 +77,9 @@
self.decoder_model = decoder
self.decoder_with_past_model = decoder_with_past
self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
@@ -96,6 +100,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
dst_path = os.path.join(save_directory, dst_file_name)
openvino.save_model(src_file, dst_path, compress_to_fp16=False)

self._save_openvino_config(save_directory)

@classmethod
def _from_pretrained(
cls,
@@ -155,9 +161,7 @@ def _from_pretrained(
decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name
decoder_with_past = None

# Give default quantization config if not provided and load_in_8bit=True
if load_in_8bit:
quantization_config = quantization_config or {"bits": 8}
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

# Load model from a local directory
if os.path.isdir(model_id):
@@ -205,6 +209,7 @@ def _from_pretrained(
decoder_with_past=decoder_with_past,
config=config,
model_save_dir=model_save_dir,
quantization_config=quantization_config,
**kwargs,
)

24 changes: 15 additions & 9 deletions optimum/intel/openvino/modeling_decoder.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import logging
import os
from pathlib import Path
@@ -100,6 +101,7 @@ def __init__(
dynamic_shapes: bool = True,
ov_config: Optional[Dict[str, str]] = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
**kwargs,
):
if not dynamic_shapes:
@@ -117,6 +119,7 @@
dynamic_shapes=False,
ov_config=ov_config,
model_save_dir=model_save_dir,
quantization_config=quantization_config,
**kwargs,
)

@@ -224,6 +227,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
dst_path = os.path.join(save_directory, OV_XML_FILE_NAME)
openvino.save_model(model_to_save, dst_path, compress_to_fp16=False)

self._save_openvino_config(save_directory)

@classmethod
def _from_transformers(
cls,
@@ -576,15 +581,10 @@ def _from_pretrained(
local_files_only=local_files_only,
)

# Give default quantization config if not provided and load_in_8bit=True
if load_in_8bit:
quantization_config = quantization_config or {"bits": 8}

if isinstance(quantization_config, dict):
if quantization_config == {"bits": 4} and config.name_or_path in _DEFAULT_4BIT_CONFIGS:
quantization_config = _DEFAULT_4BIT_CONFIGS[config.name_or_path]
if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}:
quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config)

quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

load_in_4bit = quantization_config.bits == 4 if quantization_config else False
model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config)
@@ -603,7 +603,12 @@

enable_compilation = kwargs.pop("compile", True) and not load_in_4bit
causal_model = init_cls(
model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, **kwargs
model=model,
config=config,
model_save_dir=model_cache_path.parent,
compile=enable_compilation,
quantization_config=quantization_config,
**kwargs,
)

if load_in_4bit:
@@ -632,6 +637,7 @@ def _from_pretrained(
# seqlen = get_seqlen(causal_model)
dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32)
dataset = prepare_dataset(dataset)
quantization_config = copy.deepcopy(quantization_config)
quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))

_weight_only_quantization(model, quantization_config)
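
Putting the 4-bit data-aware branch above together from the user side (a sketch; the `dataset` keyword and its string value are illustrative, inferred from the `get_dataset(quantization_config.dataset, tokenizer, seqlen=32)` call in this hunk):

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

model_id = "HuggingFaceH4/zephyr-7b-beta"  # illustrative model id
# Requesting 4-bit weights together with a calibration dataset routes through the
# nncf data-aware compression path shown above
quantization_config = OVWeightQuantizationConfig(bits=4, dataset="wikitext2")
model = OVModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
```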
36 changes: 28 additions & 8 deletions optimum/intel/openvino/modeling_diffusion.py
@@ -87,15 +87,25 @@ def __init__(
compile: bool = True,
ov_config: Optional[Dict[str, str]] = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
**kwargs,
):
self._internal_dict = config
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}
self._model_save_dir = (
Path(model_save_dir.name) if isinstance(model_save_dir, TemporaryDirectory) else model_save_dir
)

# This attribute is needed to keep one reference to the temporary directory, since garbage collection
# would end up removing the directory containing the underlying OpenVINO model
self._model_save_dir_tempdirectory_instance = None
if isinstance(model_save_dir, TemporaryDirectory):
self._model_save_dir_tempdirectory_instance = model_save_dir
self._model_save_dir = Path(model_save_dir.name)
elif isinstance(model_save_dir, str):
self._model_save_dir = Path(model_save_dir)
else:
self._model_save_dir = model_save_dir

self.vae_decoder = OVModelVaeDecoder(vae_decoder, self)
self.unet = OVModelUnet(unet, self)
self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None
@@ -140,6 +150,10 @@ def __init__(

self._internal_dict.pop("vae", None)

self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
Saves the model to the OpenVINO IR format so that it can be re-loaded using the
@@ -177,6 +191,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
if self.tokenizer_2 is not None:
self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2")

self._save_openvino_config(save_directory)

@classmethod
def _from_pretrained(
cls,
@@ -257,10 +273,7 @@ def _from_pretrained(
else:
kwargs[name] = load_method(new_model_save_dir)

# Give default quantization config if not provided and load_in_8bit=True
if load_in_8bit:
quantization_config = quantization_config or {"bits": 8}

quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
unet = cls.load_model(
new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, quantization_config
)
@@ -278,7 +291,14 @@
if model_save_dir is None:
model_save_dir = new_model_save_dir

return cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)
return cls(
unet=unet,
config=config,
model_save_dir=model_save_dir,
quantization_config=quantization_config,
**components,
**kwargs,
)
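
The diffusion pipelines follow the same pattern: the quantization config travels from `_from_pretrained` into the instance and is written back on save. A sketch (assuming `OVStableDiffusionPipeline` is the public class built on this base, that `load_in_8bit` is honored along the export path as well, and that the model id is illustrative):

```python
from optimum.intel import OVStableDiffusionPipeline

model_id = "runwayml/stable-diffusion-v1-5"  # illustrative model id
# export=True converts the diffusers checkpoint to OpenVINO IR on the fly (omit it
# for a checkpoint already stored in the OpenVINO format)
pipe = OVStableDiffusionPipeline.from_pretrained(model_id, export=True, load_in_8bit=True)
pipe.save_pretrained("ov_sd_int8")  # the quantization config is saved alongside the IR files
```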

@classmethod
def _from_transformers(