diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index 12d82b3e29..1d0c534193 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -31,7 +31,7 @@ Check out the help for more options: ```text usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code] - [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}] + [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}] [--library {transformers,diffusers,timm,sentence_transformers,open_clip}] [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym] [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}] @@ -67,10 +67,9 @@ Optional arguments: on your local machine arbitrary code present in the model repository. --weight-format {fp32,fp16,int8,int4,mxfp4,nf4} The weight format of the exported model. - --quant-mode {int8} + --quant-mode {int8,f8e4m3,f8e5m2} Quantization precision mode. This is used for applying full model quantization including - activations. The only currently supported choice is 'int8' for int8 quantization of both - weights and activations. + activations. --library {transformers,diffusers,timm,sentence_transformers,open_clip} The library used to load the model before export. If not provided, will attempt to infer the local checkpoint's library @@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with -Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below. +Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. 
This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Please see example below. ```bash optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 7a0b7d7f3b..20e2d7ca33 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"): optional_group.add_argument( "--quant-mode", type=str, - choices=["int8"], + choices=["int8", "f8e4m3", "f8e5m2"], default=None, help=( "Quantization precision mode. This is used for applying full model quantization including activations. " - "The only currently supported choice is 'int8' for int8 quantization of both weights and activations." ), ) optional_group.add_argument( @@ -365,9 +364,6 @@ def run(self): quantization_config["trust_remote_code"] = self.args.trust_remote_code ov_config = OVConfig(quantization_config=quantization_config) else: - if self.args.quant_mode != "int8": - raise ValueError("Only 'int8' quantization mode is currently supported.") - quantization_config = { "weight_format": self.args.quant_mode, "activation_format": self.args.quant_mode, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 5bf0337f38..cb09110b61 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -26,7 +26,7 @@ from optimum.configuration_utils import BaseConfig from ..utils.import_utils import is_nncf_available -from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_SPEECH_TO_TEXT_DATASETS, PREDEFINED_VISUAL_LM_DATASETS +from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS if is_nncf_available(): @@ -638,9 +638,9 @@ def __init__( SmoothQuant alpha parameter that improves the 
distribution of activations before MatMul layers and reduces quantization error. weight_format (`str`, defaults to "int8"): - Data format weights are quantized to. Possible values: ['int8']. + Data format weights are quantized to. Possible values: ['int8', 'f8e4m3', 'f8e5m2']. activation_format (`str`, defaults to "int8"): - Data format activations are compressed to. Possible values: ['int8']. + Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2']. """ super().__init__( bits=bits, @@ -658,6 +658,13 @@ def __init__( self.overflow_fix = overflow_fix self.smooth_quant_alpha = smooth_quant_alpha self.activation_format = activation_format + + f8_formats = ["f8e4m3", "f8e5m2"] + if self.activation_format in f8_formats and self.weight_format in f8_formats: + logger.info( + f"{self.activation_format} for activations and {self.weight_format} weights were found. A symmetrical scheme will be used." + ) + self.sym = True self.post_init() def post_init(self): @@ -669,24 +676,11 @@ def post_init(self): if self.bits != 8: raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}") - if self.dataset is not None: - if self.dataset not in PREDEFINED_SPEECH_TO_TEXT_DATASETS: - raise ValueError( - f"You have entered the following string value for dataset: {self.dataset}. But it is not supported." - f" Currently you can only choose {list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())}." 
- ) - if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1): raise ValueError( f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}" ) - if self.weight_format != "int8": - raise ValueError("Only 'int8' weight format is currently supported.") - - if self.activation_format != "int8": - raise ValueError("Only 'int8' activation format is currently supported.") - class OVConfig(BaseConfig): CONFIG_NAME = "openvino_config.json" @@ -711,10 +705,7 @@ def __init__( "compression", None ) # A field for backward-compatability of training-time compression parameters if self.quantization_config is not None: - if isinstance(self.quantization_config, OVWeightQuantizationConfig): - self.dtype = self.quantization_config.weight_format - else: - self.dtype = "int8" + self.dtype = self.quantization_config.weight_format else: self.dtype = dtype diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 38b96b209d..f61c2b93ca 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -458,11 +458,6 @@ def _quantize_ovbasemodel( if calibration_dataset is None: raise ValueError("Calibration dataset is required to run quantization.") - if quantization_config.weight_format != "int8": - raise ValueError("Only 'int8' weight format is currently supported.") - if quantization_config.activation_format != "int8": - raise ValueError("Only 'int8' activation format is currently supported.") - # Quantize model(s) if isinstance(self.model, _OVModelForWhisper): self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs) @@ -1077,6 +1072,14 @@ def _full_quantization( matmul=quantization_config.smooth_quant_alpha ) + q_mode_map = { + "f8e4m3": nncf.QuantizationMode.FP8_E4M3, + "f8e5m2": nncf.QuantizationMode.FP8_E5M2, + } + + if quantization_config.activation_format in q_mode_map: + kwargs.update({"mode": 
q_mode_map[quantization_config.activation_format]}) + quantized_model = nncf.quantize( model, calibration_dataset, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index f189f48a1b..f4b96ec998 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -118,10 +118,19 @@ class OVCLIExportTestCase(unittest.TestCase): ( "automatic-speech-recognition", "whisper", - "--quant-mode int8 --dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", + "int8", + "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", (14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25), (14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18), ), + ( + "text-generation", + "llama", + "f8e4m3", + "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", + (13,), + (16,), + ), ] TEST_4BIT_CONFIGURATIONS = [ @@ -411,30 +420,31 @@ def test_exporters_cli_full_quantization( self, task: str, model_type: str, + quant_mode: str, option: str, - expected_num_fq_nodes_per_model: Tuple[int], + expected_num_f_nodes_per_model: Tuple[int], expected_num_weight_nodes_per_model: Tuple[int], ): with TemporaryDirectory() as tmpdir: subprocess.run( - f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} {option} {tmpdir}", + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --quant-mode {quant_mode} {option} {tmpdir}", shell=True, check=True, ) model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(tmpdir) - submodels = [] + models = [model] if task == "automatic-speech-recognition": - submodels = [model.encoder, model.decoder] + models = [model.encoder, model.decoder] if model.decoder_with_past is not None: - submodels.append(model.decoder_with_past) + models.append(model.decoder_with_past) else: - expected_num_fq_nodes_per_model = expected_num_fq_nodes_per_model[:-1] - 
self.assertEqual(len(expected_num_fq_nodes_per_model), len(submodels)) - for i, model in enumerate(submodels): - actual_num_fq_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model) - self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_fq_nodes) - self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes["int8"]) + expected_num_f_nodes_per_model = expected_num_f_nodes_per_model[:-1] + self.assertEqual(len(expected_num_f_nodes_per_model), len(models)) + for i, model in enumerate(models): + actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_num_f_nodes_per_model[i], actual_num_f_nodes) + self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode]) def test_exporters_cli_int4_with_local_model_and_default_config(self): with TemporaryDirectory() as tmpdir: diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 59fe660d52..3100df6159 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -206,31 +206,31 @@ def get_num_quantized_nodes(model): - num_fake_quantize = 0 - num_weight_nodes = { - "int8": 0, - "int4": 0, - "f4e2m1": 0, - "f8e8m0": 0, - "nf4": 0, + num_fake_nodes = 0 + types_map = { + "i8": "int8", + "u8": "int8", + "i4": "int4", + "u4": "int4", + "f4e2m1": "f4e2m1", + "f8e8m0": "f8e8m0", + "nf4": "nf4", + "f8e4m3": "f8e4m3", + "f8e5m2": "f8e5m2", } + num_weight_nodes = {n: 0 for n in types_map.values()} ov_model = model if isinstance(model, ov.Model) else model.model for elem in ov_model.get_ops(): if "FakeQuantize" in elem.name: - num_fake_quantize += 1 + num_fake_nodes += 1 + if "FakeConvert" in elem.name: + num_fake_nodes += 1 for i in range(elem.get_output_size()): type_name = elem.get_output_element_type(i).get_type_name() - if type_name in ["i8", "u8"]: - num_weight_nodes["int8"] += 1 - if type_name in ["i4", "u4"]: - num_weight_nodes["int4"] += 1 - if type_name == "f4e2m1": 
- num_weight_nodes["f4e2m1"] += 1 - if type_name == "f8e8m0": - num_weight_nodes["f8e8m0"] += 1 - if type_name == "nf4": - num_weight_nodes["nf4"] += 1 - return num_fake_quantize, num_weight_nodes + if type_name in types_map: + name = types_map[type_name] + num_weight_nodes[name] += 1 + return num_fake_nodes, num_weight_nodes @contextmanager