Commit b796fd0: Apply comments; some additional tweaks

nikita-savelyevv committed Jun 17, 2024
1 parent: 6c3daef

Showing 4 changed files with 18 additions and 16 deletions.
2 changes: 1 addition & 1 deletion optimum/commands/export/openvino.py
@@ -263,7 +263,7 @@ def run(self):
"all_layers": None if is_int8 else self.args.all_layers,
"dataset": self.args.dataset,
"num_samples": self.args.num_samples,
"awq": self.args.awq,
"quant_method": QuantizationMethod.AWQ if self.args.awq else None,
"sensitivity_metric": self.args.sensitivity_metric,
"scale_estimation": self.args.scale_estimation,
}
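For illustration, a minimal sketch (not part of the repository) of the flag translation this hunk performs: the exporter keeps its boolean `--awq` switch but now forwards it as a `quant_method` value instead of a raw `awq` key. The import path is the one transformers uses for `QuantizationMethod`; the helper name is hypothetical.

```python
# Hypothetical helper mirroring the hunk above; not part of optimum-intel.
from transformers.utils.quantization_config import QuantizationMethod


def quantization_kwargs(awq_flag: bool) -> dict:
    # --awq no longer maps to a raw "awq" key; it selects the AWQ quant_method.
    return {"quant_method": QuantizationMethod.AWQ if awq_flag else None}


assert quantization_kwargs(True)["quant_method"] is QuantizationMethod.AWQ
assert quantization_kwargs(False)["quant_method"] is None
```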
24 changes: 13 additions & 11 deletions optimum/intel/openvino/configuration.py
@@ -172,12 +172,18 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
num_samples (`int`, *optional*):
The maximum number of samples composing the calibration dataset.
quant_method (`str`, defaults to OVQuantizationMethod.DEFAULT):
-     Weight compression method to apply.
+     Weight compression method to apply. Possible options:
+         - "default": default weight quantization will be applied.
+         - "awq": compressed weights will be computed according to the Activation-Aware Quantization (AWQ)
+           method. AWQ improves generation quality of INT4-compressed LLMs, but requires additional time
+           for tuning weights on a calibration dataset. To run AWQ, providing a dataset is required.
+           Note: it's possible that there will be no matching patterns in the model to apply AWQ; in such
+           a case it will be skipped.
+         - "hybrid": the hybrid mode involves quantization of weights in MatMul and Embedding layers, and
+           activations of other layers, facilitating accuracy preservation post-optimization while reducing
+           the model size. Hybrid mode performs well when applied to a UNet model in diffusion pipelines.
awq (`bool`, *optional*):
-     Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires
-     additional time for tuning weights on a calibration dataset. To run AWQ, providing a dataset is required.
-     Note: it's possible that there will be no matching patterns in the model to apply AWQ, in such case it
-     will be skipped.
+     Alias for `quant_method="awq"`.
scale_estimation (`bool`, *optional*):
Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
compressed layers. Providing a dataset is required to run scale estimation.
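As a usage illustration of the options documented above, a sketch constructing the config for the "default" and "awq" paths; the `dataset="wikitext2"` preset and the plain-string `quant_method` are assumptions (they rely on `QuantizationMethod` being a string enum and on the dataset presets available at the time):

```python
from optimum.intel import OVWeightQuantizationConfig

# Default INT4 weight-only compression; no calibration data needed.
default_cfg = OVWeightQuantizationConfig(bits=4)

# AWQ tunes weights against activations, so a calibration dataset is required.
awq_cfg = OVWeightQuantizationConfig(bits=4, quant_method="awq", dataset="wikitext2")
```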
@@ -208,8 +214,9 @@ def __init__(
self.all_layers = all_layers
self.sensitivity_metric = sensitivity_metric
self.quant_method = quant_method
- self.awq = awq
self.scale_estimation = scale_estimation
+ if awq:
+     self.quant_method = QuantizationMethod.AWQ
self.post_init()

def post_init(self):
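A sketch of the alias semantics introduced in this hunk: `awq=True` is folded into `quant_method` at construction time, so downstream code only inspects `quant_method`. It assumes the constructor still accepts the `awq` keyword, as the updated docstring states, and that a dataset is supplied to satisfy validation:

```python
from transformers.utils.quantization_config import QuantizationMethod
from optimum.intel import OVWeightQuantizationConfig

cfg = OVWeightQuantizationConfig(bits=4, awq=True, dataset="wikitext2")
# The boolean flag is not stored; it is rewritten into quant_method.
assert cfg.quant_method == QuantizationMethod.AWQ
```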
@@ -255,11 +262,6 @@ def post_init(self):
if self.tokenizer is not None and not isinstance(self.tokenizer, str):
raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")

- if self.quant_method == QuantizationMethod.AWQ:
-     self.quant_method = OVQuantizationMethod.DEFAULT
-     self.awq = True
-     logger.warning('Using quant_method="AWQ" is deprecated. Please use awq=True instead in the future.')


@dataclass
class OVDynamicQuantizationConfig(OVWeightQuantizationConfig):
2 changes: 1 addition & 1 deletion optimum/intel/openvino/quantization.py
@@ -819,7 +819,7 @@ def _weight_only_quantization(
group_size=config.group_size,
all_layers=config.all_layers,
sensitivity_metric=sensitivity_metric,
- awq=config.awq,
+ awq=config.quant_method == QuantizationMethod.AWQ or None,
ignored_scope=config.get_ignored_scope_instance(),
dataset=dataset,
subset_size=config.num_samples if config.num_samples else 128,
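The trailing `or None` is a small tri-state idiom: when AWQ is not requested, `nncf.compress_weights` receives `None` (its default) rather than an explicit `False`. A self-contained sketch of just that mapping, with a hypothetical helper name:

```python
from typing import Optional


def awq_argument(quant_method: str) -> Optional[bool]:
    # `X or None` maps True -> True and False -> None, so the callee sees its
    # default (None) rather than an explicit False when AWQ is not requested.
    return (quant_method == "awq") or None


assert awq_argument("awq") is True
assert awq_argument("default") is None
```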
6 changes: 3 additions & 3 deletions tests/openvino/test_quantization.py
@@ -229,7 +229,7 @@ class OVWeightCompressionTest(unittest.TestCase):
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
dataset="ptb",
- awq=True,
+ quant_method=QuantizationMethod.AWQ,
scale_estimation=True,
),
16,
@@ -457,8 +457,8 @@ def test_ovmodel_4bit_auto_compression_with_config(
with tempfile.TemporaryDirectory() as tmp_dir:
quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
- if quantization_config.awq:
-     # TODO: Check that AWQ was actually applied
+ if quantization_config.quant_method == QuantizationMethod.AWQ or quantization_config.scale_estimation:
+     # TODO: Check that AWQ and SE were actually applied
pass

tokenizer = AutoTokenizer.from_pretrained(model_id)
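Finally, a sketch of the broadened test guard: the verification branch (still a TODO upstream) now triggers when either AWQ or scale estimation is configured, not only AWQ. The helper name is hypothetical:

```python
from transformers.utils.quantization_config import QuantizationMethod


def needs_algorithm_check(quant_method, scale_estimation: bool) -> bool:
    # Mirrors the updated guard in test_ovmodel_4bit_auto_compression_with_config.
    return quant_method == QuantizationMethod.AWQ or scale_estimation


assert needs_algorithm_check(QuantizationMethod.AWQ, scale_estimation=False)
assert needs_algorithm_check("default", scale_estimation=True)
assert not needs_algorithm_check("default", scale_estimation=False)
```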
