diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index 8cffd06121..4e0d28f846 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -70,7 +70,7 @@ Optional arguments: --pad-token-id PAD_TOKEN_ID This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. --ratio RATIO A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80% of the layers will be quantized to int4 while - 20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8. + 20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. --sym Whether to apply symmetric quantization --group-size GROUP_SIZE The group size to use for int4 quantization. Recommended value is 128 and -1 will results in per-column quantization. diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index ee1f62388f..2bdee32e17 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -102,7 +102,7 @@ def parse_args_openvino(parser: "ArgumentParser"): default=None, help=( "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 " - "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8." + "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0." ), ) optional_group.add_argument( @@ -277,7 +277,7 @@ def _get_default_int4_config(model_id_or_path, library_name): else: quantization_config = { "bits": 8 if is_int8 else 4, - "ratio": 1 if is_int8 else (self.args.ratio or 0.8), + "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]), "sym": self.args.sym or False, "group_size": -1 if is_int8 else self.args.group_size, "all_layers": None if is_int8 else self.args.all_layers,