Commit bf4080a: apply review comments
l-bat committed Mar 6, 2024 (1 parent: 4f7e87c)

Showing 5 changed files with 85 additions and 79 deletions.
28 changes: 13 additions & 15 deletions optimum/intel/openvino/configuration.py
@@ -167,7 +167,7 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
bits (`int`, defaults to 8):
The number of bits to quantize to.
sym (`bool`, *optional*, defaults to `False`):
sym (`bool`, defaults to `False`):
Whether to use symmetric quantization.
tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
The tokenizer used to process the dataset. You can pass either:
@@ -177,26 +177,24 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
user or organization name, like `dbmdz/bert-base-german-cased`.
- A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
dataset (`Union[List[str]]`, *optional*):
dataset (`str or List[str]`, *optional*):
The dataset used for data-aware compression. You can provide your own dataset in a list of strings or just use
one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLMs or
['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for SD models
group_size (`int`, *optional*, defaults to 128):
The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
ratio (`float`, *optional*, defaults to 1.0):
['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for SD models.
ratio (`float`, defaults to 1.0):
The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
and the rest to INT8_ASYM).
group_size (`int`, *optional*):
The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
all_layers (`bool`, *optional*):
Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
sensitivity_metric (`nncf.SensitivityMetric`, *optional*):
sensitivity_metric (`str`, *optional*):
The sensitivity metric for assigning quantization precision to layers. In order to
preserve the accuracy of the model, the more sensitive layers receive a higher precision.
awq (`bool`, *optional*):
Enables AWQ method to unify weight ranges and improve overall model accuracy.
ignored_scope (`nncf.IgnoredScope`, *optional*):
ignored_scope (`dict`, *optional*):
An ignored scope that defines the list of model control flow graph nodes to be ignored during quantization.
subset_size (`int`, *optional*, defaults to 128):
Number of data samples to calculate activation statistics.
num_samples (`int`, *optional*):
The maximum number of samples composing the calibration dataset.
"""

@@ -205,13 +203,13 @@ def __init__(
bits: int = 8,
sym: bool = False,
tokenizer: Optional[Any] = None,
dataset: Optional[str] = None,
dataset: Optional[Union[str, List[str]]] = None,
ratio: float = 1.0,
group_size: Optional[int] = None,
all_layers: Optional[bool] = None,
sensitivity_metric: Optional[str] = None,
ignored_scope: Optional[dict] = None,
subset_size: int = 128,
num_samples: Optional[int] = None,
**kwargs,
):
self.bits = bits
@@ -223,7 +221,7 @@ def __init__(
self.all_layers = all_layers
self.sensitivity_metric = sensitivity_metric
self.ignored_scope = ignored_scope
self.subset_size = subset_size
self.num_samples = num_samples
self.quant_method = "default" # TODO : enable AWQ after nncf v2.9.0 release
self.post_init()

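The user-visible effect of this file's changes is the renamed `num_samples` argument (formerly `subset_size`) and the relaxed `dataset` type. A minimal sketch of the updated config in use; the model id and parameter values below are illustrative assumptions, not part of this commit:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Hypothetical data-aware 4-bit compression setup. `num_samples` replaces the
# former `subset_size` argument and caps the calibration dataset size.
config = OVWeightQuantizationConfig(
    bits=4,
    sym=False,
    dataset="wikitext2",  # a str or a List[str] of raw samples after this change
    ratio=0.8,
    group_size=128,
    num_samples=64,
)
model = OVModelForCausalLM.from_pretrained(
    "gpt2",  # illustrative model id
    export=True,
    quantization_config=config,
)
```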
3 changes: 2 additions & 1 deletion optimum/intel/openvino/modeling_decoder.py
@@ -635,7 +635,8 @@ def _from_pretrained(
# from optimum.gptq.utils import get_seqlen

# seqlen = get_seqlen(causal_model)
dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32)
nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
dataset = prepare_dataset(dataset)
quantization_config = copy.deepcopy(quantization_config)
quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))
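A standalone sketch of the fallback added above, where an unset `num_samples` defaults to 128 samples for the GPTQ-style dataset helpers; the tokenizer id is an illustrative assumption:

```python
from optimum.gptq.data import get_dataset, prepare_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer
num_samples = None                              # e.g. quantization_config.num_samples left unset
nsamples = num_samples if num_samples else 128  # same fallback as in the diff
dataset = get_dataset("wikitext2", tokenizer, seqlen=32, nsamples=nsamples)
dataset = prepare_dataset(dataset)
```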
57 changes: 27 additions & 30 deletions optimum/intel/openvino/modeling_diffusion.py
@@ -17,6 +17,7 @@
import math
import os
import shutil
from copy import deepcopy
from pathlib import Path
from tempfile import TemporaryDirectory, gettempdir
from typing import Any, Dict, List, Optional, Union
@@ -35,7 +36,6 @@
from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
from huggingface_hub import snapshot_download
from nncf import Dataset
from openvino._offline_transformations import compress_model_transformation
from openvino.runtime import Core
from transformers import CLIPFeatureExtractor, CLIPTokenizer
@@ -276,17 +276,15 @@ def _from_pretrained(
kwargs[name] = load_method(new_model_save_dir)

quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

dataset = None
if quantization_config:
dataset = quantization_config.dataset
quantization_config.dataset = None # apply weight compression without dataset

weight_quantization_config = deepcopy(quantization_config)
unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name
if quantization_config and dataset is None:
unet = cls.load_model(unet_path, quantization_config)
else:
if weight_quantization_config is not None and weight_quantization_config.dataset is not None:
# load the UNet model uncompressed to apply hybrid quantization further
unet = cls.load_model(unet_path)
# Apply weights compression to other `components` without dataset
weight_quantization_config.dataset = None
else:
unet = cls.load_model(unet_path, quantization_config)

components = {
"vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
@@ -296,12 +294,12 @@ def _from_pretrained(
}

for key, value in components.items():
components[key] = cls.load_model(value, quantization_config) if value.is_file() else None
components[key] = cls.load_model(value, weight_quantization_config) if value.is_file() else None

if model_save_dir is None:
model_save_dir = new_model_save_dir

if quantization_config and dataset is not None:
if quantization_config and quantization_config.dataset is not None:
sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)

supported_pipelines = (
@@ -313,23 +311,23 @@ def _from_pretrained(
raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}")

num_inference_steps = 4 if isinstance(sd_model, OVLatentConsistencyModelPipeline) else 50
quantization_config.dataset = dataset
nsamples = quantization_config.num_samples if quantization_config.num_samples else 200
dataset = deepcopy(quantization_config.dataset)

if isinstance(quantization_config.dataset, str):
if isinstance(dataset, str):
from .quantization import get_stable_diffusion_dataset

dataset_name = quantization_config.dataset
num_samples = math.ceil(quantization_config.subset_size / num_inference_steps)
quantization_config.dataset = get_stable_diffusion_dataset(dataset_name, num_samples)
num_unet_runs = math.ceil(nsamples / num_inference_steps)
dataset = get_stable_diffusion_dataset(dataset, num_unet_runs)

unet_inputs = sd_model.prepare_inputs(
quantization_config.dataset, quantization_config.subset_size, num_inference_steps
)
quantization_config.dataset = unet_inputs
unet_inputs = sd_model._prepare_unet_inputs(dataset, nsamples, num_inference_steps)

from .quantization import _hybrid_quantization

unet = _hybrid_quantization(sd_model.unet.model, quantization_config)
hybrid_quantization_config = deepcopy(quantization_config)
hybrid_quantization_config.dataset = unet_inputs
hybrid_quantization_config.num_samples = nsamples
unet = _hybrid_quantization(sd_model.unet.model, hybrid_quantization_config)

return cls(
unet=unet,
@@ -340,27 +338,26 @@ def _from_pretrained(
**kwargs,
)

def prepare_inputs(
def _prepare_unet_inputs(
self,
dataset: Dataset,
subset_size: int,
dataset: List[str],
num_samples: int,
num_inference_steps: int,
height: Optional[int] = 512,
width: Optional[int] = 512,
**kwargs,
) -> Dataset:
) -> Dict[str, Any]:
self.compile()
calibration_data = []

from .quantization import InferRequestWrapper

self.unet.request = InferRequestWrapper(self.unet.request, calibration_data)
for prompt in dataset.get_inference_data():
for prompt in dataset:
_ = self.__call__(prompt, num_inference_steps=num_inference_steps, height=height, width=width)
if len(calibration_data) >= subset_size:
break

self.unet.request = self.unet.request.request
return Dataset(calibration_data)
return calibration_data[:num_samples]

@classmethod
def _from_transformers(
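From the user's side, the hybrid path above is triggered simply by passing a dataset in the quantization config when exporting a diffusion pipeline. A hedged sketch; the model id and sample count are illustrative assumptions:

```python
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

# Providing a dataset routes the UNet through hybrid quantization, while the
# remaining components receive weight-only compression (dataset stripped).
config = OVWeightQuantizationConfig(
    bits=8,
    dataset="conceptual_captions",
    num_samples=32,
)
pipe = OVStableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # illustrative model id
    export=True,
    quantization_config=config,
)
pipe.save_pretrained("sd_hybrid_int8_ov")
```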
64 changes: 37 additions & 27 deletions optimum/intel/openvino/quantization.py
@@ -19,7 +19,7 @@
from collections import deque
from copy import deepcopy
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import nncf
import openvino
@@ -548,7 +548,7 @@ def _remove_unused_columns(self, dataset: "Dataset"):

def _weight_only_quantization(
model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict]
):
) -> openvino.runtime.Model:
config = quantization_config
if isinstance(config, dict):
config = OVWeightQuantizationConfig.from_dict(quantization_config)
@@ -562,7 +562,8 @@ def _weight_only_quantization(

from optimum.gptq.data import get_dataset, prepare_dataset

dataset = get_dataset(config.dataset, tokenizer, seqlen=32)
nsamples = config.num_samples if config.num_samples else 128
dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
dataset = prepare_dataset(dataset)

sensitivity_metric = None
@@ -588,7 +589,7 @@ def _weight_only_quantization(
# awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0
ignored_scope=ignored_scope,
dataset=dataset,
# subset_size=config.subset_size, # TODO : enable from nncf v2.9.0
# subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0
)


@@ -639,7 +640,7 @@ def _collect_ops_with_weights(model):

def get_stable_diffusion_dataset(
dataset_name: str, nsamples: int = 50, seed: int = 0, text_column: str = "caption"
) -> nncf.Dataset:
) -> List[str]:
if dataset_name not in ["conceptual_captions", "laion/220k-GPT4Vision-captions-from-LIVIS", "laion/filtered-wit"]:
raise ValueError(
f"""You have entered a string value for dataset. You can only choose between
@@ -649,37 +650,46 @@ def get_stable_diffusion_dataset(

data = load_dataset(dataset_name, split="train", streaming=True).shuffle(seed=seed).take(nsamples)
dataset = [batch[text_column] for batch in data]
return nncf.Dataset(dataset)
return dataset
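
Because the helper now returns a plain list of caption strings rather than an `nncf.Dataset`, callers can slice or inspect it before wrapping it for NNCF. A small sketch using this internal helper (streams the public `conceptual_captions` dataset):

```python
import nncf
from optimum.intel.openvino.quantization import get_stable_diffusion_dataset

prompts = get_stable_diffusion_dataset("conceptual_captions", nsamples=4)
print(len(prompts), prompts[0])           # plain Python strings
calibration_data = nncf.Dataset(prompts)  # wrap only where NNCF needs it
```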


def _hybrid_quantization(model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict]):
dataset = quantization_config.dataset
wc_ignored_scope = deepcopy(quantization_config.ignored_scope)

if isinstance(wc_ignored_scope, dict):
wc_ignored_scope["types"] = wc_ignored_scope.get("types", []) + ["Convolution"]
else:
assert wc_ignored_scope is None
wc_ignored_scope = {"types": ["Convolution"]}
def _hybrid_quantization(
model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig
) -> openvino.runtime.Model:
"""
Quantize a model in hybrid mode with NNCF, which means that we quantize
the weights of MatMul and Embedding layers and the activations of other layers.
The optimization specifications are defined in `quantization_config`.
Args:
model (`openvino.runtime.Model`):
The OpenVINO Runtime model for applying hybrid quantization.
quantization_config (`OVWeightQuantizationConfig`):
The configuration containing the parameters related to quantization.
Returns:
The OpenVINO Runtime model with applied hybrid quantization.
"""
ignored_scope = quantization_config.ignored_scope if quantization_config.ignored_scope is not None else {}

ops_to_compress = _collect_ops_with_weights(model)
ptq_ignored_scope = deepcopy(quantization_config.ignored_scope)
if isinstance(ptq_ignored_scope, dict):
ptq_ignored_scope["names"] = ptq_ignored_scope.get("names", []) + ops_to_compress
else:
assert ptq_ignored_scope is None
ptq_ignored_scope = {"names": ops_to_compress}
ptq_ignored_scope = deepcopy(ignored_scope)
ptq_ignored_scope["names"] = ignored_scope.get("names", []) + ops_to_compress

quantization_config.dataset = None # Apply Weight Compression without dataset
quantization_config.ignored_scope = wc_ignored_scope
compressed_model = _weight_only_quantization(model, quantization_config)
wc_quantization_config = deepcopy(quantization_config)
wc_quantization_config.ignored_scope = ignored_scope
wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"]
# Apply Weight Compression without dataset
wc_quantization_config.dataset = None
compressed_model = _weight_only_quantization(model, wc_quantization_config)

subset_size = quantization_config.num_samples if quantization_config.num_samples else 200
quantized_model = nncf.quantize(
compressed_model,
dataset,
model=compressed_model,
calibration_dataset=nncf.Dataset(quantization_config.dataset),
model_type=nncf.ModelType.TRANSFORMER,
ignored_scope=nncf.IgnoredScope(**ptq_ignored_scope),
# The SQ algo should be disabled for MatMul nodes because their weights are already compressed
advanced_parameters=nncf.AdvancedQuantizationParameters(AdvancedSmoothQuantParameters(matmul=-1)),
subset_size=quantization_config.subset_size,
subset_size=subset_size,
)
return quantized_model
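
A minimal, self-contained sketch of the ignored-scope bookkeeping used above (the node names are made-up placeholders): weight compression additionally skips `Convolution` nodes, while the follow-up `nncf.quantize` call skips the ops whose weights were already compressed:

```python
from copy import deepcopy

user_ignored_scope = {"names": ["up_blocks.0/conv"]}            # illustrative user input
ops_to_compress = ["down_blocks.0/matmul", "mid_block/matmul"]  # e.g. _collect_ops_with_weights(model)

# Scope for the full-quantization step: also ignore already-compressed weight ops.
ptq_ignored_scope = deepcopy(user_ignored_scope)
ptq_ignored_scope["names"] = user_ignored_scope.get("names", []) + ops_to_compress

# Scope for the weight-only compression step: additionally ignore Convolutions.
wc_ignored_scope = deepcopy(user_ignored_scope)
wc_ignored_scope["types"] = user_ignored_scope.get("types", []) + ["Convolution"]

print(ptq_ignored_scope)  # later passed as nncf.IgnoredScope(**ptq_ignored_scope)
print(wc_ignored_scope)
```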
12 changes: 6 additions & 6 deletions tests/openvino/test_quantization.py
@@ -362,7 +362,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8):
model_id = MODEL_NAMES[model_type]
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", subset_size=5)
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2)
with tempfile.TemporaryDirectory() as tmp_dir:
model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)

@@ -373,18 +373,18 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8):

model.save_pretrained(tmp_dir)

@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[2:])
def test_ovmodel_hybrid_quantization_with_custom_dataset(
self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8
):
model_id = MODEL_NAMES[model_type]
dataset_name = "daspartho/stable-diffusion-prompts"
dataset = load_dataset(dataset_name, split="train", streaming=True)
quantization_dataset = nncf.Dataset(dataset, lambda x: x["prompt"])
dataset = [
"dream rose covered with clean crystal, sharp edges, transparent, beautiful, highly detailed, high render"
]
model = model_cls.from_pretrained(
model_id,
export=True,
quantization_config=OVWeightQuantizationConfig(bits=8, dataset=quantization_dataset, subset_size=3),
quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3),
)
num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet)
self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
