[OV] Fix data-free VLM compression via optimum-cli (#1058)
* Fix VLM compression

* Extend compression tests to check submodel weights precision

* Update references

* Fix condition

* Export in auto dtype if possible

* Reformat condition
nikita-savelyevv authored Dec 19, 2024
1 parent 9efb8e3 commit b17d1e0
Showing 5 changed files with 171 additions and 59 deletions.
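For context, the flow being fixed is a data-free 4-bit export of a vision-language model through the CLI, i.e. a --weight-format option with no --dataset. A minimal sketch of such an invocation, mirroring the command the tests below build (the checkpoint name and output directory are illustrative):

import subprocess

# Data-free VLM compression: a 4-bit weight format is requested but no
# --dataset is passed, so no calibration data is involved.
subprocess.run(
    "optimum-cli export openvino --model llava-hf/llava-v1.6-mistral-7b-hf "
    "--task image-text-to-text --weight-format int4 --group-size 16 --ratio 0.8 "
    "llava_next_int4",
    shell=True,
    check=True,
)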
6 changes: 4 additions & 2 deletions optimum/commands/export/openvino.py
@@ -368,7 +368,9 @@ def run(self):
             model.save_pretrained(self.args.output)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif (task.startswith("text-generation") or task == "image-text-to-text") and quantize_with_dataset:
+        elif (task.startswith("text-generation") and quantize_with_dataset) or (
+            task == "image-text-to-text" and quantization_config is not None
+        ):
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM

@@ -378,7 +380,7 @@ def run(self):

                 model_cls = OVModelForVisualCausalLM

-            # To quantize a model with a dataset, an instance of a model class is required
+            # In this case, to apply quantization an instance of a model class is required
             model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
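The reworked condition above routes an image-text-to-text export through a model-class instance whenever a quantization config is present, dataset or not; previously the branch required a dataset, so data-free VLM requests fell through to the plain export path and the config was ignored. A rough Python equivalent of what the command now does in the data-free case, as a sketch rather than the exact CLI internals (checkpoint and output path are illustrative):

from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

# quantization_config is set but carries no dataset, so only the weights are
# compressed while the model is exported.
model = OVModelForVisualCausalLM.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    export=True,
    quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16, ratio=0.8),
)
model.save_pretrained("llava_next_int4")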
3 changes: 2 additions & 1 deletion optimum/intel/openvino/modeling_visual_language.py
@@ -598,7 +598,8 @@ def _from_transformers(
         if load_in_8bit is None and not quantization_config:
             ov_config = None
         else:
-            ov_config = OVConfig(dtype="fp32")
+            # Export in fp32 if compression won't be applied later
+            ov_config = OVConfig(dtype="fp32" if load_in_8bit is False else "auto")

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)

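The change above stops forcing an fp32 export when compression is going to run afterwards anyway: fp32 is now used only when 8-bit loading was explicitly disabled, otherwise the model is exported in "auto" dtype and compressed after export. A small sketch of the resulting decision (a restatement of the diff, not an actual helper in the codebase):

from typing import Optional

def export_dtype(load_in_8bit: Optional[bool], quantization_config) -> Optional[str]:
    # No explicit request and no config: leave ov_config unset (default handling).
    if load_in_8bit is None and not quantization_config:
        return None
    # 8-bit explicitly disabled means no compression follows, so export in fp32;
    # otherwise export in "auto" dtype and compress the exported weights later.
    return "fp32" if load_in_8bit is False else "auto"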
101 changes: 78 additions & 23 deletions tests/openvino/test_exporters_cli.py
@@ -14,12 +14,14 @@
 import subprocess
 import unittest
 from pathlib import Path
+from typing import Dict, List

 from parameterized import parameterized
 from transformers import AutoModelForCausalLM
 from utils_tests import (
     _ARCHITECTURES_TO_EXPECTED_INT8,
     MODEL_NAMES,
+    compare_num_quantized_nodes_per_model,
     get_num_quantized_nodes,
 )

@@ -108,37 +110,47 @@ class OVCLIExportTestCase(unittest.TestCase):
         SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("flux", 7, 56))

     TEST_4BIT_CONFIGURATIONS = [
-        ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}),
-        ("text-generation-with-past", "opt125m", "int4 --group-size 64", {"int8": 4, "int4": 144}),
-        ("text-generation-with-past", "opt125m", "mxfp4", {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}),
-        ("text-generation-with-past", "opt125m", "nf4", {"int8": 4, "nf4": 72}),
-        ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 8 --all-layers", {"int4": 16}),
+        ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", [{"int8": 4, "int4": 72}]),
+        ("text-generation-with-past", "opt125m", "int4 --group-size 64", [{"int8": 4, "int4": 144}]),
+        ("text-generation-with-past", "opt125m", "mxfp4", [{"int8": 4, "f4e2m1": 72, "f8e8m0": 72}]),
+        ("text-generation-with-past", "opt125m", "nf4", [{"int8": 4, "nf4": 72}]),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --ratio 1.0 --sym --group-size 8 --all-layers",
+            [{"int4": 16}],
+        ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 "
             "--sensitivity-metric max_activation_variance",
-            {"int8": 4, "int4": 14},
+            [{"int8": 4, "int4": 14}],
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 --num-samples 100 ",
-            {"int8": 4, "int4": 14},
+            [{"int8": 4, "int4": 14}],
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --gptq --dataset wikitext2 --num-samples 100 ",
-            {"int8": 4, "int4": 14},
+            [{"int8": 4, "int4": 14}],
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --lora-correction --dataset auto --num-samples 16",
-            {"int8": 60, "int4": 14},
+            [{"int8": 60, "int4": 14}],
         ),
-        ("text-generation-with-past", "llama_awq", "int4 --group-size 16 --backup-precision none", {"int4": 28}),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --group-size 16 --backup-precision none --ratio 0.5",
+            [{"int4": 6}],
+        ),
     ]

     if is_transformers_version(">=", "4.40.0"):
@@ -147,36 +159,73 @@ class OVCLIExportTestCase(unittest.TestCase):
                 (
                     "image-text-to-text",
                     "llava_next",
-                    'int4 --group-size 16 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
-                    "--dataset contextual --num-samples 1",
-                    {"int8": 8, "int4": 22},
+                    "int4 --group-size 16 --ratio 0.8",
+                    [{"int8": 14, "int4": 16}, {"int8": 9}, {"int8": 1}],
+                ),
+                (
+                    "image-text-to-text",
+                    "llava_next",
+                    'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" '
+                    "--dataset contextual --num-samples 1",
+                    [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}],
                 ),
+                (
+                    "image-text-to-text",
+                    "nanollava",
+                    "int4 --group-size 8 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+                ),
                 (
                     "image-text-to-text",
                     "nanollava",
-                    'int4 --group-size 8 --ratio 0.9 --sensitivity-metric "mean_activation_variance" '
+                    'int4 --group-size 8 --ratio 0.8 --sensitivity-metric "mean_activation_variance" '
                     "--dataset contextual --num-samples 1 --trust-remote-code",
-                    {"int8": 12, "int4": 18},
+                    [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
                 ),
             ]
         )

     if is_transformers_version(">=", "4.45.0"):
         TEST_4BIT_CONFIGURATIONS.extend(
             [
+                (
+                    "image-text-to-text",
+                    "minicpmv",
+                    "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 10, "int4": 20}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                ),
+                (
+                    "image-text-to-text",
+                    "minicpmv",
+                    'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
+                    "--dataset contextual --num-samples 1 --trust-remote-code",
+                    [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                ),
+                (
+                    "image-text-to-text",
+                    "internvl2",
+                    "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+                ),
                 (
                     "image-text-to-text",
                     "internvl2",
-                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "hessian_input_activation" '
+                    'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                     "--dataset contextual --num-samples 1 --trust-remote-code",
-                    {"int8": 6, "int4": 24},
+                    [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
                 ),
+                (
+                    "image-text-to-text",
+                    "phi3_v",
+                    "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 8, "int4": 10}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
+                ),
                 (
                     "image-text-to-text",
                     "phi3_v",
-                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
+                    'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                     "--dataset contextual --num-samples 1 --trust-remote-code",
-                    {"int8": 4, "int4": 14},
+                    [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
                 ),
             ]
         )
@@ -300,7 +349,9 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int):
         self.assertEqual(exp_num_fq, num_fq)

     @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
-    def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict):
+    def test_exporters_cli_4bit(
+        self, task: str, model_type: str, option: str, expected_num_weight_nodes_per_model: List[Dict]
+    ):
         with TemporaryDirectory() as tmpdir:
             result = subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
@@ -317,11 +368,15 @@ def test_exporters_cli_4bit(
                 else _HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]
             ).from_pretrained(tmpdir, **model_kwargs)

-            ov_model = model.lm_model if task == "image-text-to-text" else model.model
+            submodels = []
+            if task == "text-generation-with-past":
+                submodels = [model]
+            elif task == "image-text-to-text":
+                submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
+                submodels += [getattr(model, part) for part in model.additional_parts]
+
+            compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)

-            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
-            expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
-            self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
             self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)
             self.assertTrue("--gptq" not in option or b"Applying GPTQ" in result.stdout)
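The new assertions rely on compare_num_quantized_nodes_per_model from tests/openvino/utils_tests.py, whose diff is not rendered here. A plausible reconstruction of what it checks, assuming get_num_quantized_nodes returns a (num_fq_nodes, per_precision_counts) pair as it does in the assertions above (hypothetical, not the actual implementation):

from typing import Dict, List

def compare_num_quantized_nodes_per_model(
    test_case, models, expected_num_weight_nodes_per_model: List[Dict]
):
    # Hypothetical sketch; get_num_quantized_nodes is the existing helper
    # defined alongside this function in utils_tests.py.
    test_case.assertEqual(len(models), len(expected_num_weight_nodes_per_model))
    for submodel, expected in zip(models, expected_num_weight_nodes_per_model):
        _, num_weight_nodes = get_num_quantized_nodes(submodel)
        # Precisions absent from the expectation must not appear in the model.
        expected = {**{k: 0 for k in num_weight_nodes}, **expected}
        test_case.assertEqual(expected, num_weight_nodes)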
(Diffs for the remaining 2 changed files are not shown.)
