From c93c2e7ea990152ce862e0ca3596d483610ced57 Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Tue, 7 Jan 2025 13:47:56 +0100
Subject: [PATCH 1/9] Fp8 implementation

---
 docs/source/openvino/export.mdx         |  9 ++++-----
 optimum/commands/export/openvino.py     |  6 +-----
 optimum/intel/openvino/configuration.py | 20 +++++++++++---------
 optimum/intel/openvino/quantization.py  | 13 ++++++++-----
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx
index 12d82b3e29..83ab3315f5 100644
--- a/docs/source/openvino/export.mdx
+++ b/docs/source/openvino/export.mdx
@@ -31,7 +31,7 @@ Check out the help for more options:
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,fp8_e4m3,fp8_e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,10 +67,9 @@ Optional arguments:
                         on your local machine arbitrary code present in the model repository.
   --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
                         The weight format of the exported model.
-  --quant-mode {int8}
+  --quant-mode {int8,fp8_e4m3,fp8_e5m2}
                         Quantization precision mode. This is used for applying full model quantization including
-                        activations. The only currently supported choice is 'int8' for int8 quantization of both
-                        weights and activations.
+                        activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
                         The library used to load the model before export. If not provided, will attempt to infer the
                         local checkpoint's library
@@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
 
-Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
+Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Currently this is only supported for speech-to-text models. Please see the example below.
 
 ```bash
 optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 7a0b7d7f3b..67510e0bc8 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--quant-mode",
         type=str,
-        choices=["int8"],
+        choices=["int8", "fp8_e4m3", "fp8_e5m2"],
         default=None,
         help=(
             "Quantization precision mode. This is used for applying full model quantization including activations. "
-            "The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
         ),
     )
     optional_group.add_argument(
@@ -365,9 +364,6 @@ def run(self):
             quantization_config["trust_remote_code"] = self.args.trust_remote_code
             ov_config = OVConfig(quantization_config=quantization_config)
         else:
-            if self.args.quant_mode != "int8":
-                raise ValueError("Only 'int8' quantization mode is currently supported.")
-
             quantization_config = {
                 "weight_format": self.args.quant_mode,
                 "activation_format": self.args.quant_mode,
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 5bf0337f38..777c96dbc9 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -638,9 +638,9 @@ def __init__(
                 SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and
                 reduces quantization error.
             weight_format (`str`, defaults to "int8"):
-                Data format weights are quantized to. Possible values: ['int8'].
+                Data format weights are quantized to. Possible values: ['int8', 'fp8_e4m3', 'fp8_e5m2'].
             activation_format (`str`, defaults to "int8"):
-                Data format activations are compressed to. Possible values: ['int8'].
+                Data format activations are compressed to. Possible values: ['int8', 'fp8_e4m3', 'fp8_e5m2'].
         """
         super().__init__(
             bits=bits,
@@ -681,11 +681,15 @@ def post_init(self):
                 f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
             )
 
-        if self.weight_format != "int8":
-            raise ValueError("Only 'int8' weight format is currently supported.")
-
-        if self.activation_format != "int8":
-            raise ValueError("Only 'int8' activation format is currently supported.")
+        if not self.sym:
+            if self.activation_format != "int8":
+                raise ValueError(
+                    f"Asymmetric quantization cannot be performed in {self.activation_format} activation format."
+                )
+            if self.weight_format != "int8":
+                raise ValueError(
+                    f"Asymmetric quantization cannot be performed in {self.weight_format} weight format."
+                )
 
 class OVConfig(BaseConfig):
@@ -713,8 +717,6 @@ def __init__(
         if self.quantization_config is not None:
             if isinstance(self.quantization_config, OVWeightQuantizationConfig):
                 self.dtype = self.quantization_config.weight_format
-            else:
-                self.dtype = "int8"
         else:
             self.dtype = dtype
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 962738e0e1..e8cfc5007f 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -458,11 +458,6 @@ def _quantize_ovbasemodel(
         if calibration_dataset is None:
             raise ValueError("Calibration dataset is required to run quantization.")
 
-        if quantization_config.weight_format != "int8":
-            raise ValueError("Only 'int8' weight format is currently supported.")
-        if quantization_config.activation_format != "int8":
-            raise ValueError("Only 'int8' activation format is currently supported.")
-
         # Quantize model(s)
         if isinstance(self.model, _OVModelForWhisper):
             self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs)
@@ -1071,6 +1066,14 @@ def _full_quantization(
             matmul=quantization_config.smooth_quant_alpha
         )
 
+    q_mode_map = {
+        "fp8_e4m3": nncf.QuantizationMode.FP8_E4M3,
+        "fp8_e5m2": nncf.QuantizationMode.FP8_E5M2,
+    }
+
+    if quantization_config.activation_format in q_mode_map:
+        kwargs.update({"mode": q_mode_map[quantization_config.activation_format]})
+
     quantized_model = nncf.quantize(
         model,
         calibration_dataset,
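To make the new code path concrete: once `--quant-mode fp8_e4m3` reaches `_full_quantization`, the change above amounts to forwarding an `nncf.QuantizationMode` into `nncf.quantize`. A minimal standalone sketch — the model path and calibration inputs are placeholders, not part of this patch:

```python
import numpy as np
import nncf
import openvino as ov

# Read any OpenVINO IR model; "model.xml" is a placeholder path.
model = ov.Core().read_model("model.xml")

# A toy calibration set: an iterable of example inputs wrapped in nncf.Dataset.
calibration_dataset = nncf.Dataset(
    [np.random.rand(1, 3, 224, 224).astype(np.float32) for _ in range(4)]
)

# Mirrors the q_mode_map added in _full_quantization: the CLI string is mapped
# to an nncf.QuantizationMode and passed to nncf.quantize via `mode`.
q_mode_map = {
    "fp8_e4m3": nncf.QuantizationMode.FP8_E4M3,
    "fp8_e5m2": nncf.QuantizationMode.FP8_E5M2,
}
quantized_model = nncf.quantize(model, calibration_dataset, mode=q_mode_map["fp8_e4m3"])
```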
From 44f11a7bd89474c55ddf6613838ba7b0976d0d4a Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Tue, 7 Jan 2025 16:34:15 +0100
Subject: [PATCH 2/9] All datasets support

---
 optimum/intel/openvino/configuration.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 777c96dbc9..fbaa450949 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -669,13 +669,6 @@ def post_init(self):
         if self.bits != 8:
             raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")
 
-        if self.dataset is not None:
-            if self.dataset not in PREDEFINED_SPEECH_TO_TEXT_DATASETS:
-                raise ValueError(
-                    f"You have entered the following string value for dataset: {self.dataset}. But it is not supported."
-                    f" Currently you can only choose {list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())}."
-                )
-
         if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1):
             raise ValueError(
                 f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
             )
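With the predefined-dataset check gone, any dataset identifier supported downstream can now reach calibration. Assuming the class touched here is `OVQuantizationConfig` (the diff shows only the method bodies, so the class name is an inference), a config like the following would previously have been rejected in `post_init` and now passes validation:

```python
from optimum.intel import OVQuantizationConfig

# "wikitext2" is not in PREDEFINED_SPEECH_TO_TEXT_DATASETS; before this patch
# post_init() raised a ValueError for it, after the patch it is accepted.
config = OVQuantizationConfig(bits=8, dataset="wikitext2", num_samples=32)
```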
From b54abf1953034c0f4ad0569e5b1ed9ae849c4122 Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Wed, 8 Jan 2025 12:12:01 +0100
Subject: [PATCH 3/9] Added test

---
 docs/source/openvino/export.mdx         |  4 ++--
 optimum/commands/export/openvino.py     |  2 +-
 optimum/intel/openvino/configuration.py |  6 ++---
 optimum/intel/openvino/quantization.py  |  4 ++--
 tests/openvino/test_exporters_cli.py    | 20 ++++++++++++----
 tests/openvino/utils_tests.py           | 32 ++++++++++++-------------
 6 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx
index 83ab3315f5..3762febb6c 100644
--- a/docs/source/openvino/export.mdx
+++ b/docs/source/openvino/export.mdx
@@ -31,7 +31,7 @@ Check out the help for more options:
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,fp8_e4m3,fp8_e5m2}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,7 +67,7 @@ Optional arguments:
                         on your local machine arbitrary code present in the model repository.
   --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
                         The weight format of the exported model.
-  --quant-mode {int8,fp8_e4m3,fp8_e5m2}
+  --quant-mode {int8,f8e4m3,f8e5m2}
                         Quantization precision mode. This is used for applying full model quantization including
                         activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 67510e0bc8..20e2d7ca33 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -78,7 +78,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--quant-mode",
         type=str,
-        choices=["int8", "fp8_e4m3", "fp8_e5m2"],
+        choices=["int8", "f8e4m3", "f8e5m2"],
        default=None,
         help=(
             "Quantization precision mode. This is used for applying full model quantization including activations. "
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index fbaa450949..cfe2c9c60e 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -26,7 +26,7 @@
 from optimum.configuration_utils import BaseConfig
 
 from ..utils.import_utils import is_nncf_available
-from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_SPEECH_TO_TEXT_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
+from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
 
 if is_nncf_available():
@@ -638,9 +638,9 @@ def __init__(
                 SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and
                 reduces quantization error.
             weight_format (`str`, defaults to "int8"):
-                Data format weights are quantized to. Possible values: ['int8', 'fp8_e4m3', 'fp8_e5m2'].
+                Data format weights are quantized to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
             activation_format (`str`, defaults to "int8"):
-                Data format activations are compressed to. Possible values: ['int8', 'fp8_e4m3', 'fp8_e5m2'].
+                Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
         """
         super().__init__(
             bits=bits,
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index e8cfc5007f..5f780cd3a7 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -1067,8 +1067,8 @@ def _full_quantization(
     q_mode_map = {
-        "fp8_e4m3": nncf.QuantizationMode.FP8_E4M3,
-        "fp8_e5m2": nncf.QuantizationMode.FP8_E5M2,
+        "f8e4m3": nncf.QuantizationMode.FP8_E4M3,
+        "f8e5m2": nncf.QuantizationMode.FP8_E5M2,
     }
 
     if quantization_config.activation_format in q_mode_map:
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index f03b4fbc57..02541c2493 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -114,7 +114,16 @@ class OVCLIExportTestCase(unittest.TestCase):
         (
             "automatic-speech-recognition",
             "whisper",
-            "--quant-mode int8 --dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
+            "int8",
+            "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
+            (14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25),
+            (14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18),
+        ),
+        (
+            "automatic-speech-recognition",
+            "whisper",
+            "f8e4m3",
+            "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code --sym",
             (14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25),
             (14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18),
         ),
@@ -407,13 +416,14 @@ def test_exporters_cli_full_quantization(
         self,
         task: str,
         model_type: str,
+        quant_mode: str,
         option: str,
         expected_num_fq_nodes_per_model: Tuple[int],
         expected_num_weight_nodes_per_model: Tuple[int],
     ):
         with TemporaryDirectory() as tmpdir:
             subprocess.run(
-                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} {option} {tmpdir}",
+                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --quant-mode {quant_mode} {option} {tmpdir}",
                 shell=True,
                 check=True,
             )
@@ -424,9 +434,9 @@ def test_exporters_cli_full_quantization(
             submodels = [model.encoder, model.decoder, model.decoder_with_past]
         self.assertEqual(len(expected_num_fq_nodes_per_model), len(submodels))
         for i, model in enumerate(submodels):
-            actual_num_fq_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
-            self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_fq_nodes)
-            self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes["int8"])
+            actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
+            self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_f_nodes)
+            self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 02b81bfdc5..eb8406e9ae 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -203,29 +203,29 @@ def get_num_quantized_nodes(model):
     num_fake_quantize = 0
-    num_weight_nodes = {
-        "int8": 0,
-        "int4": 0,
-        "f4e2m1": 0,
-        "f8e8m0": 0,
-        "nf4": 0,
-    }
+    types_map = {
+        "i8": "int8",
+        "u8": "int8",
+        "i4": "int4",
+        "u4": "int4",
+        "f4e2m1": "f4e2m1",
+        "f8e8m0": "f8e8m0",
+        "nf4": "nf4",
+        "f8e4m3": "f8e4m3",
+        "f8e5m2": "f8e5m2",
+    }
+    num_weight_nodes = {n: 0 for n in types_map.values()}
     ov_model = model if isinstance(model, ov.Model) else model.model
     for elem in ov_model.get_ops():
         if "FakeQuantize" in elem.name:
             num_fake_quantize += 1
+        elif "FakeConvert" in elem.name:
+            num_fake_quantize += 1
         for i in range(elem.get_output_size()):
             type_name = elem.get_output_element_type(i).get_type_name()
-            if type_name in ["i8", "u8"]:
-                num_weight_nodes["int8"] += 1
-            if type_name in ["i4", "u4"]:
-                num_weight_nodes["int4"] += 1
-            if type_name == "f4e2m1":
-                num_weight_nodes["f4e2m1"] += 1
-            if type_name == "f8e8m0":
-                num_weight_nodes["f8e8m0"] += 1
-            if type_name == "nf4":
-                num_weight_nodes["nf4"] += 1
+            if type_name in types_map:
+                name = types_map[type_name]
+                num_weight_nodes[name] += 1
     return num_fake_quantize, num_weight_nodes
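As background on the reworked test helper: int8 quantization inserts `FakeQuantize` operations into the graph, while the FP8 modes insert `FakeConvert` operations, and weight precisions can be read off the output element types. A self-contained sketch of the same counting idea (an illustration, not the exact helper):

```python
import openvino as ov

# Element-type names as reported by OpenVINO, bucketed the way the tests do.
TYPES_MAP = {"i8": "int8", "u8": "int8", "i4": "int4", "u4": "int4", "f8e4m3": "f8e4m3", "f8e5m2": "f8e5m2"}

def count_quantized_nodes(ov_model: ov.Model):
    num_fake_nodes = 0
    num_weight_nodes = {name: 0 for name in TYPES_MAP.values()}
    for op in ov_model.get_ops():
        # int8 shows up as FakeQuantize ops, fp8 as FakeConvert ops.
        if "FakeQuantize" in op.get_friendly_name() or "FakeConvert" in op.get_friendly_name():
            num_fake_nodes += 1
        for i in range(op.get_output_size()):
            type_name = op.get_output_element_type(i).get_type_name()
            if type_name in TYPES_MAP:
                num_weight_nodes[TYPES_MAP[type_name]] += 1
    return num_fake_nodes, num_weight_nodes
```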
From 6f5cd5bc079318cffbe42b1e1ae4b77c55368fe3 Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Wed, 8 Jan 2025 13:42:25 +0100
Subject: [PATCH 4/9] Update test

---
 tests/openvino/test_exporters_cli.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 02541c2493..310f9a596b 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -120,12 +120,12 @@ class OVCLIExportTestCase(unittest.TestCase):
             (14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18),
         ),
         (
-            "automatic-speech-recognition",
-            "whisper",
+            "text-generation",
+            "phi3",
             "f8e4m3",
-            "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code --sym",
-            (14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25),
-            (14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18),
+            "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code --sym",
+            (13,),
+            (10,),
         ),
     ]
@@ -429,11 +429,11 @@ def test_exporters_cli_full_quantization(
         )
         model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(tmpdir)
 
-        submodels = []
+        models = [model]
         if task == "automatic-speech-recognition":
-            submodels = [model.encoder, model.decoder, model.decoder_with_past]
-        self.assertEqual(len(expected_num_fq_nodes_per_model), len(submodels))
-        for i, model in enumerate(submodels):
+            models = [model.encoder, model.decoder, model.decoder_with_past]
+        self.assertEqual(len(expected_num_fq_nodes_per_model), len(models))
+        for i, model in enumerate(models):
             actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
             self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_f_nodes)
             self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])
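The new test row drives FP8 full quantization of a text-generation model through the CLI; a rough Python-API equivalent is sketched below. The model id is a placeholder and the keyword names follow the quantization config touched in patch 1, so treat this as an assumption rather than a verified recipe:

```python
from optimum.intel import OVModelForCausalLM, OVQuantizationConfig

# Full f8e4m3 quantization of a small causal LM, mirroring
# `--quant-mode f8e4m3 --dataset wikitext2 --smooth-quant-alpha 0.9 --sym`.
qconfig = OVQuantizationConfig(
    weight_format="f8e4m3",
    activation_format="f8e4m3",
    dataset="wikitext2",
    num_samples=1,
    smooth_quant_alpha=0.9,
    sym=True,
)
model = OVModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-LlamaForCausalLM",  # placeholder checkpoint
    export=True,
    quantization_config=qconfig,
)
model.save_pretrained("./llama-f8e4m3")
```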
From ac7b57a6bf50f7fb9a4f486365eab0725cf6ab77 Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Wed, 8 Jan 2025 13:43:48 +0100
Subject: [PATCH 5/9] Correctness

---
 tests/openvino/utils_tests.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index eb8406e9ae..760c98bbb4 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -202,7 +202,7 @@
 
 def get_num_quantized_nodes(model):
-    num_fake_quantize = 0
+    num_fake_nodes = 0
     types_map = {
         "i8": "int8",
         "u8": "int8",
@@ -218,15 +218,15 @@ def get_num_quantized_nodes(model):
     ov_model = model if isinstance(model, ov.Model) else model.model
     for elem in ov_model.get_ops():
         if "FakeQuantize" in elem.name:
-            num_fake_quantize += 1
+            num_fake_nodes += 1
         elif "FakeConvert" in elem.name:
-            num_fake_quantize += 1
+            num_fake_nodes += 1
         for i in range(elem.get_output_size()):
             type_name = elem.get_output_element_type(i).get_type_name()
             if type_name in types_map:
                 name = types_map[type_name]
                 num_weight_nodes[name] += 1
-    return num_fake_quantize, num_weight_nodes
+    return num_fake_nodes, num_weight_nodes

From 2df7fc4e65153fd97d93f8b0dc586faf77e7d96e Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Wed, 8 Jan 2025 13:44:45 +0100
Subject: [PATCH 6/9] Correctness

---
 tests/openvino/utils_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 760c98bbb4..ce6231fb5f 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -219,7 +219,7 @@ def get_num_quantized_nodes(model):
     for elem in ov_model.get_ops():
         if "FakeQuantize" in elem.name:
             num_fake_nodes += 1
-        elif "FakeConvert" in elem.name:
+        if "FakeConvert" in elem.name:
             num_fake_nodes += 1
         for i in range(elem.get_output_size()):
             type_name = elem.get_output_element_type(i).get_type_name()
From 710f50ac28b57a4287cd8a64759f76286e24e74d Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Wed, 8 Jan 2025 16:09:21 +0100
Subject: [PATCH 7/9] Update docs/source/openvino/export.mdx

Co-authored-by: Alexander Kozlov
---
 docs/source/openvino/export.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx
index 3762febb6c..1d0c534193 100644
--- a/docs/source/openvino/export.mdx
+++ b/docs/source/openvino/export.mdx
@@ -165,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
 
-Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Currently this is only supported for speech-to-text models. Please see the example below.
+Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Please see the example below.
 
 ```bash
 optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
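After an FP8 export, the effect is straightforward to verify on the saved IR: the graph should contain `FakeConvert` operations rather than the `FakeQuantize` ones int8 produces. A quick hedged check — the directory is the placeholder output of the sketch shown after patch 4:

```python
from optimum.intel import OVModelForCausalLM

# Load the exported model; model.model is the underlying ov.Model.
model = OVModelForCausalLM.from_pretrained("./llama-f8e4m3")  # placeholder path
fake_converts = [
    op for op in model.model.get_ops() if "FakeConvert" in op.get_friendly_name()
]
print(f"FakeConvert ops found: {len(fake_converts)}")
```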
From 3174ef02d5b9bcda27aff4ceaaec3ae392c1e018 Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Wed, 8 Jan 2025 17:09:55 +0100
Subject: [PATCH 8/9] Change test model

---
 tests/openvino/test_exporters_cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 310f9a596b..3d31e35943 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -121,11 +121,11 @@ class OVCLIExportTestCase(unittest.TestCase):
         ),
         (
             "text-generation",
-            "phi3",
+            "llama",
             "f8e4m3",
             "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code --sym",
             (13,),
-            (10,),
+            (16,),
         ),
     ]

From 0a8e3e77f3914176ce0bcab79652585dffb1c849 Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Tue, 14 Jan 2025 10:12:02 +0100
Subject: [PATCH 9/9] Apply comments

---
 optimum/intel/openvino/configuration.py | 20 ++++++++------------
 tests/openvino/test_exporters_cli.py    |  8 ++++----
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index cfe2c9c60e..cb09110b61 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -658,6 +658,13 @@ def __init__(
         self.overflow_fix = overflow_fix
         self.smooth_quant_alpha = smooth_quant_alpha
         self.activation_format = activation_format
+
+        f8_formats = ["f8e4m3", "f8e5m2"]
+        if self.activation_format in f8_formats and self.weight_format in f8_formats:
+            logger.info(
+                f"{self.activation_format} for activations and {self.weight_format} weights were found. A symmetric scheme will be used."
+            )
+            self.sym = True
         self.post_init()
 
     def post_init(self):
@@ -674,16 +681,6 @@ def post_init(self):
                 f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
             )
 
-        if not self.sym:
-            if self.activation_format != "int8":
-                raise ValueError(
-                    f"Asymmetric quantization cannot be performed in {self.activation_format} activation format."
-                )
-            if self.weight_format != "int8":
-                raise ValueError(
-                    f"Asymmetric quantization cannot be performed in {self.weight_format} weight format."
-                )
-
 
 class OVConfig(BaseConfig):
     CONFIG_NAME = "openvino_config.json"
@@ -708,8 +705,7 @@ def __init__(
             "compression", None
         )  # A field for backward-compatability of training-time compression parameters
         if self.quantization_config is not None:
-            if isinstance(self.quantization_config, OVWeightQuantizationConfig):
-                self.dtype = self.quantization_config.weight_format
+            self.dtype = self.quantization_config.weight_format
         else:
             self.dtype = dtype
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 3d31e35943..840c6d4eb3 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -123,7 +123,7 @@ class OVCLIExportTestCase(unittest.TestCase):
             "text-generation",
             "llama",
             "f8e4m3",
-            "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code --sym",
+            "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
             (13,),
             (16,),
         ),
@@ -418,7 +418,7 @@ def test_exporters_cli_full_quantization(
         self,
         task: str,
         model_type: str,
         quant_mode: str,
         option: str,
-        expected_num_fq_nodes_per_model: Tuple[int],
+        expected_num_f_nodes_per_model: Tuple[int],
         expected_num_weight_nodes_per_model: Tuple[int],
     ):
         with TemporaryDirectory() as tmpdir:
@@ -432,10 +432,10 @@ def test_exporters_cli_full_quantization(
         models = [model]
         if task == "automatic-speech-recognition":
             models = [model.encoder, model.decoder, model.decoder_with_past]
-        self.assertEqual(len(expected_num_fq_nodes_per_model), len(models))
+        self.assertEqual(len(expected_num_f_nodes_per_model), len(models))
         for i, model in enumerate(models):
             actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
-            self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_f_nodes)
+            self.assertEqual(expected_num_f_nodes_per_model[i], actual_num_f_nodes)
             self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])
 
     def test_exporters_cli_int4_with_local_model_and_default_config(self):