Update to NNCF 2.11 #763

Merged · 9 commits · Jun 18, 2024
9 changes: 9 additions & 0 deletions docs/source/optimization_ov.mdx
@@ -75,6 +75,15 @@ For 4-bit quantization you can also specify the following arguments in the quant

Smaller `group_size` and `ratio` values usually improve accuracy at the expense of larger model size and higher inference latency.

The quality of a 4-bit weight-compressed model can be further improved by employing one of the following data-dependent methods:
* AWQ (Activation-Aware Quantization) is an algorithm that tunes model weights for more accurate 4-bit compression. It slightly improves generation quality of compressed LLMs, but requires significant additional time and memory for tuning weights on a calibration dataset. Note that if the model contains no patterns to which AWQ can be applied, the algorithm is skipped.
* Scale Estimation is a method that tunes quantization scales to minimize the `L2` error between the original and compressed layers. Running scale estimation requires a dataset and also incurs additional time and memory overhead.

AWQ and Scale Estimation can be applied together or separately. To do so, provide the corresponding arguments to the 4-bit `OVWeightQuantizationConfig` together with a dataset. For example:
```python
quantization_config = OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, quant_method="awq", scale_estimation=True, dataset="wikitext2")
```
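
The resulting configuration is then passed to the model class when exporting. Below is a minimal sketch of that step; the model ID and output directory are placeholders, and the exact arguments depend on your model:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4, sym=False, ratio=0.8, quant_method="awq", scale_estimation=True, dataset="wikitext2"
)

# AWQ and Scale Estimation run on the calibration dataset during export,
# so expect extra time and memory compared to data-free 4-bit compression.
model = OVModelForCausalLM.from_pretrained(
    "model-id-placeholder",  # placeholder: any supported causal LM
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("model-int4-awq-se")  # placeholder output directory
```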

### Static quantization

When applying post-training static quantization, both the weights and the activations are quantized.
11 changes: 11 additions & 0 deletions optimum/commands/export/openvino.py
@@ -141,6 +141,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
"case it will be skipped."
),
)
optional_group.add_argument(
"--scale-estimation",
action="store_true",
default=None,
help=(
"Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original "
"and compressed layers. Providing a dataset is required to run scale estimation. Please note, that "
"applying scale estimation takes additional memory and time."
),
)
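# Illustration only (not part of this change): with this flag in place, a 4-bit export
# with scale estimation could be invoked from the command line roughly as
#   optimum-cli export openvino --model <model_id> --weight-format int4 \
#       --scale-estimation --dataset wikitext2 <output_dir>
# where <model_id> and <output_dir> are placeholders.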
optional_group.add_argument(
"--sensitivity-metric",
type=str,
@@ -255,6 +265,7 @@ def run(self):
"num_samples": self.args.num_samples,
"quant_method": QuantizationMethod.AWQ if self.args.awq else None,
"sensitivity_metric": self.args.sensitivity_metric,
"scale_estimation": self.args.scale_estimation,
}

if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
16 changes: 15 additions & 1 deletion optimum/intel/openvino/configuration.py
@@ -172,7 +172,19 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
num_samples (`int`, *optional*):
The maximum number of samples composing the calibration dataset.
quant_method (`str`, defaults to OVQuantizationMethod.DEFAULT):
Weight compression method to apply.
Weight compression method to apply. Possible options:
- "default": default weight quantization will be applied.
- "awq": compressed weights will be computed according to the Activation-Aware-Quantization (AWQ)
method. AWQ improves generation quality of INT4-compressed LLMs, but requires
additional time for tuning weights on a calibration dataset. To run AWQ, providing a dataset is
required. Note: it's possible that there will be no matching patterns in the model to apply AWQ, in
which case it will be skipped.
- "hybrid": The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and
activations of other layers, facilitating accuracy preservation post-optimization while reducing
the model size. Hybrid mode performs well when applied to a UNet model in diffusion pipelines.
scale_estimation (`bool`, *optional*):
Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
compressed layers. Providing a dataset is required to run scale estimation.
"""

def __init__(
@@ -188,6 +200,7 @@ def __init__(
ignored_scope: Optional[dict] = None,
num_samples: Optional[int] = None,
quant_method: Union[QuantizationMethod, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT,
scale_estimation: bool = None,
**kwargs,
):
super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
@@ -198,6 +211,7 @@
self.all_layers = all_layers
self.sensitivity_metric = sensitivity_metric
self.quant_method = quant_method
self.scale_estimation = scale_estimation
self.post_init()

def post_init(self):
1 change: 1 addition & 0 deletions optimum/intel/openvino/quantization.py
@@ -823,6 +823,7 @@ def _weight_only_quantization(
ignored_scope=config.get_ignored_scope_instance(),
dataset=dataset,
subset_size=config.num_samples if config.num_samples else 128,
scale_estimation=config.scale_estimation,
)
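# Annotation (not part of the diff): `scale_estimation` is forwarded as-is to NNCF's
# `compress_weights` call above; the keyword is available starting with NNCF 2.11,
# hence the `nncf>=2.11.0` bump in setup.py below. Leaving it as `None` defers to
# NNCF's default behavior.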


4 changes: 2 additions & 2 deletions setup.py
@@ -60,8 +60,8 @@

EXTRAS_REQUIRE = {
"neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
"openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"],
"nncf": ["nncf>=2.10.0"],
"openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"],
"nncf": ["nncf>=2.11.0"],
"ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.41.2"],
"diffusers": ["diffusers"],
"quality": QUALITY_REQUIRE,
8 changes: 8 additions & 0 deletions tests/openvino/test_exporters_cli.py
@@ -97,6 +97,13 @@ class OVCLIExportTestCase(unittest.TestCase):
4,
28,
),
(
"text-generation-with-past",
"llama_awq",
"int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 --num-samples 100 ",
4,
28,
),
]

def _openvino_export(
@@ -218,6 +225,7 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec
self.assertEqual(expected_int8, num_int8)
self.assertEqual(expected_int4, num_int4)
self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)

def test_exporters_cli_help(self):
subprocess.run(
6 changes: 4 additions & 2 deletions tests/openvino/test_quantization.py
@@ -230,6 +230,7 @@ class OVWeightCompressionTest(unittest.TestCase):
sensitivity_metric="mean_activation_magnitude",
dataset="ptb",
quant_method=QuantizationMethod.AWQ,
scale_estimation=True,
),
16,
),
@@ -456,8 +457,8 @@ def test_ovmodel_4bit_auto_compression_with_config(
with tempfile.TemporaryDirectory() as tmp_dir:
quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
if quantization_config.quant_method == QuantizationMethod.AWQ:
# TODO: Check that AWQ was actually applied
if quantization_config.quant_method == QuantizationMethod.AWQ or quantization_config.scale_estimation:
# TODO: Check that AWQ and SE were actually applied
pass

tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -558,6 +559,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self):
"ignored_scope": nncf.IgnoredScope(),
"awq": None,
"subset_size": 128,
"scale_estimation": None,
}
compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params)
