From f533af0fe17c1fe9681a8c794bc02f2b794d03ea Mon Sep 17 00:00:00 2001 From: Alexander Date: Fri, 29 Nov 2024 16:22:06 +0400 Subject: [PATCH] Set FP16 KV-cache for non-quantized text models --- optimum/exporters/openvino/__main__.py | 2 ++ optimum/exporters/openvino/convert.py | 13 +++++++++---- tests/openvino/test_export.py | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 3ac8314889..e4fe2a7a41 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -456,6 +456,8 @@ class StoreAttr(object): from optimum.intel.openvino.quantization import _weight_only_quantization _weight_only_quantization(submodel, quantization_config) + if "text-generation" in task: + submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"]) compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml" save_model(submodel, compressed_submodel_path, compress_to_fp16=False) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index e4ece9801b..fb6ab08e34 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -99,11 +99,15 @@ def _set_runtime_options( ], task: str, library_name: str, + quantized_model: bool ): for model_name in models_and_export_configs.keys(): _, sub_export_config = models_and_export_configs[model_name] + sub_export_config.runtime_options = {} if "diffusers" in library_name or "text-generation" in task: - sub_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} + sub_export_config.runtime_options["ACTIVATIONS_SCALE_FACTOR"] = "8.0" + if not quantized_model and "text-generation" in task: + sub_export_config.runtime_options["KV_CACHE_PRECISION"] = "f16" def _save_model( @@ -116,8 +120,8 @@ def _save_model( compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16" model = _add_version_info_to_model(model, library_name) - if hasattr(config, "runtime_options"): - model = _add_runtime_options_to_rt_info(model, config.runtime_options) + runtime_options = config.runtime_options if hasattr(config, "runtime_options") else {} + model = _add_runtime_options_to_rt_info(model, runtime_options) save_model(model, path, compress_to_fp16) del model gc.collect() @@ -755,7 +759,8 @@ def export_from_model( model.save_config(output) - _set_runtime_options(models_and_export_configs, task, library_name) + _set_runtime_options(models_and_export_configs, task, library_name, + hasattr(ov_config, "quantization_config") and ov_config.quantization_config) export_models( models_and_export_configs=models_and_export_configs, diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 80a45cab6e..2d57f92d0e 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -132,6 +132,7 @@ def _openvino_export( ov_model.model.get_rt_info()["optimum"]["transformers_version"], _transformers_version ) self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"])) + self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) if library_name == "diffusers": self.assertTrue(