Skip to content

Commit

Permalink
Set FP16 KV-cache for non-quantized text models (#1043)
Browse files: browse the repository at this point in the history
* Set FP16 KV-cache for non-quantized text models

* Style
  • Loading branch information
AlexKoff88 authored Dec 4, 2024
1 parent ba45714 commit c94b3f5
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 4 deletions.
2 changes: 2 additions & 0 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,8 @@ class StoreAttr(object):
from optimum.intel.openvino.quantization import _weight_only_quantization

_weight_only_quantization(submodel, quantization_config)
+ if "text-generation" in task:
+ submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"])

compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
Expand Down
17 changes: 13 additions & 4 deletions optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,15 @@ def _set_runtime_options(
],
task: str,
library_name: str,
+ quantized_model: bool,
):
for model_name in models_and_export_configs.keys():
_, sub_export_config = models_and_export_configs[model_name]
+ sub_export_config.runtime_options = {}
if "diffusers" in library_name or "text-generation" in task:
- sub_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
+ sub_export_config.runtime_options["ACTIVATIONS_SCALE_FACTOR"] = "8.0"
+ if not quantized_model and "text-generation" in task:
+ sub_export_config.runtime_options["KV_CACHE_PRECISION"] = "f16"


def _save_model(
Expand All @@ -116,8 +120,8 @@ def _save_model(
compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16"
model = _add_version_info_to_model(model, library_name)

- if hasattr(config, "runtime_options"):
- model = _add_runtime_options_to_rt_info(model, config.runtime_options)
+ runtime_options = config.runtime_options if hasattr(config, "runtime_options") else {}
+ model = _add_runtime_options_to_rt_info(model, runtime_options)
save_model(model, path, compress_to_fp16)
del model
gc.collect()
Expand Down Expand Up @@ -755,7 +759,12 @@ def export_from_model(

model.save_config(output)

- _set_runtime_options(models_and_export_configs, task, library_name)
+ _set_runtime_options(
+ models_and_export_configs,
+ task,
+ library_name,
+ hasattr(ov_config, "quantization_config") and ov_config.quantization_config,
+ )

export_models(
models_and_export_configs=models_and_export_configs,
Expand Down
1 change: 1 addition & 0 deletions tests/openvino/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def _openvino_export(
ov_model.model.get_rt_info()["optimum"]["transformers_version"], _transformers_version
)
self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"]))
+ self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

if library_name == "diffusers":
self.assertTrue(
Expand Down

0 comments on commit c94b3f5

Please sign in to comment.