From c94b3f5efaedb8c83cb15f8e0cede6f523631bde Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Wed, 4 Dec 2024 08:56:47 +0300 Subject: [PATCH] Set FP16 KV-cache for non-quantized text models (#1043) * Set FP16 KV-cache for non-quantized text models * Style --- optimum/exporters/openvino/__main__.py | 2 ++ optimum/exporters/openvino/convert.py | 17 +++++++++++++---- tests/openvino/test_export.py | 1 + 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 3ac8314889..e4fe2a7a41 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -456,6 +456,8 @@ class StoreAttr(object): from optimum.intel.openvino.quantization import _weight_only_quantization _weight_only_quantization(submodel, quantization_config) + if "text-generation" in task: + submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"]) compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml" save_model(submodel, compressed_submodel_path, compress_to_fp16=False) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index e4ece9801b..6012e6cfb5 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -99,11 +99,15 @@ def _set_runtime_options( ], task: str, library_name: str, + quantized_model: bool, ): for model_name in models_and_export_configs.keys(): _, sub_export_config = models_and_export_configs[model_name] + sub_export_config.runtime_options = {} if "diffusers" in library_name or "text-generation" in task: - sub_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} + sub_export_config.runtime_options["ACTIVATIONS_SCALE_FACTOR"] = "8.0" + if not quantized_model and "text-generation" in task: + sub_export_config.runtime_options["KV_CACHE_PRECISION"] = "f16" def _save_model( @@ -116,8 +120,8 @@ def _save_model( compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16" model = _add_version_info_to_model(model, library_name) - if hasattr(config, "runtime_options"): - model = _add_runtime_options_to_rt_info(model, config.runtime_options) + runtime_options = config.runtime_options if hasattr(config, "runtime_options") else {} + model = _add_runtime_options_to_rt_info(model, runtime_options) save_model(model, path, compress_to_fp16) del model gc.collect() @@ -755,7 +759,12 @@ def export_from_model( model.save_config(output) - _set_runtime_options(models_and_export_configs, task, library_name) + _set_runtime_options( + models_and_export_configs, + task, + library_name, + hasattr(ov_config, "quantization_config") and ov_config.quantization_config, + ) export_models( models_and_export_configs=models_and_export_configs, diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 80a45cab6e..2d57f92d0e 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -132,6 +132,7 @@ def _openvino_export( ov_model.model.get_rt_info()["optimum"]["transformers_version"], _transformers_version ) self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"])) + self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) if library_name == "diffusers": self.assertTrue(