From 09126af02382b92c017f8a8bba6c99e1c608b623 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Fri, 3 Jan 2025 16:02:37 +0100
Subject: [PATCH 01/46] add torch sample

---
 examples/llm_compression/torch/main.py        | 74 +++++++++++++++++++
 .../weight_compression/algorithm.py           |  2 -
 .../weight_compression/mixed_precision.py     | 17 ++++-
 .../weight_compression/openvino_backend.py    |  2 +
 .../weight_compression/scale_estimation.py    | 10 ++-
 .../weight_compression/torch_backend.py       | 40 ++++++++++
 .../weight_compression/torch_fx_backend.py    | 40 ++++++++++
 nncf/quantization/quantize_model.py           |  8 --
 8 files changed, 178 insertions(+), 15 deletions(-)
 create mode 100644 examples/llm_compression/torch/main.py

diff --git a/examples/llm_compression/torch/main.py b/examples/llm_compression/torch/main.py
new file mode 100644
index 00000000000..88449b42281
--- /dev/null
+++ b/examples/llm_compression/torch/main.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+
+import torch
+from datasets import load_dataset
+from optimum.exporters.openvino.convert import export_from_model
+from optimum.intel.openvino import OVModelForCausalLM
+from transformers import AutoModelForCausalLM
+from transformers import AutoTokenizer
+
+import nncf
+
+MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"
+OUTPUT_DIR = "tinyllama_compressed"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def quantize(model, dataset):
+    quantization_dataset = nncf.Dataset(dataset)
+    compressed_model = nncf.compress_weights(
+        model,
+        dataset=quantization_dataset,
+        mode=nncf.CompressWeightsMode.INT4_SYM,
+        ratio=0.8,
+        sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+    )
+    compressed_model.to("cpu")  # move to CPU to work around an issue with CUDA export
+    export_from_model(compressed_model, OUTPUT_DIR, stateful=False, compression_option="fp32", device="cpu")
+
+
+def validate(tokenizer, model):
+    input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)
+
+    start_t = time.time()
+    output = model.generate(**input_ids, max_new_tokens=100)
+    print("Elapsed time: ", time.time() - start_t)
+
+    output_text = tokenizer.decode(output[0])
+    print(output_text)
+    return output_text
+
+
+def main():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
+    model.eval()
+
+    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+    dataset = dataset.filter(lambda example: len(example["text"]) > 128)  # TODO: check whether this filter is necessary
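+    # NOTE: the 128-character threshold is a heuristic to skip near-empty wikitext entries.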
+
+    def transform_fn(data):
+        tokenized_text = tokenizer(data["text"], return_tensors="pt")
+        return tokenized_text.data  # return a dict, one of the input formats the NNCF engine accepts
+
+    dataset = dataset.map(transform_fn).with_format("torch", device=device)
+
+    quantize(model, dataset)
+    compressed_model = OVModelForCausalLM.from_pretrained(
+        OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"}
+    )
+    validate(tokenizer, compressed_model)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py
index c5a4e2d221c..14d0c800d32 100644
--- a/nncf/quantization/algorithms/weight_compression/algorithm.py
+++ b/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -702,8 +702,6 @@ def get_matmul_input_to_output_nodes_map(
         """
        matmul_input_to_output_nodes_map = defaultdict(list)
        for node in matmul_nodes:
-            if node.layer_attributes.input_attributes["transpose"]:  # It works only for OV
-                raise nncf.UnsupportedModelError("Transposed input is not supported")
            act_node, output_port_id = self._get_activation_node_and_port(node, graph)
            matmul_input_to_output_nodes_map[(act_node, output_port_id)].append(node)
        return matmul_input_to_output_nodes_map
diff --git a/nncf/quantization/algorithms/weight_compression/mixed_precision.py b/nncf/quantization/algorithms/weight_compression/mixed_precision.py
index f2fbae67a5c..900b639feeb 100644
--- a/nncf/quantization/algorithms/weight_compression/mixed_precision.py
+++ b/nncf/quantization/algorithms/weight_compression/mixed_precision.py
@@ -211,7 +211,7 @@ class DataBasedCriterion(DataFreeCriterion, ABC):

     @property
     def available_backends(self) -> List[BackendType]:
-        return [BackendType.OPENVINO]
+        return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX]

     def _set_backend_entity(self, model: TModel) -> None:
         model_backend = get_backend(model)
@@ -219,6 +219,14 @@ def _set_backend_entity(self, model: TModel) -> None:
             from nncf.quantization.algorithms.weight_compression.openvino_backend import OVMixedPrecisionAlgoBackend

             self._backend_entity = OVMixedPrecisionAlgoBackend(model)
+        elif model_backend == BackendType.TORCH:
+            from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend
+
+            self._backend_entity = PTMixedPrecisionAlgoBackend()
+        elif model_backend == BackendType.TORCH_FX:
+            from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXMixedPrecisionAlgoBackend
+
+            self._backend_entity = FXMixedPrecisionAlgoBackend()
         else:
             raise nncf.UnsupportedBackendError(
                 "Cannot return backend-specific entity because {} is not supported!".format(model_backend.value)
@@ -303,7 +311,7 @@ def _get_activation_node_and_port(self, node: NNCFNode, nncf_graph: NNCFGraph) -
     def _get_statistics_for_node(
         self, statistic_points: StatisticPointsContainer, node: NNCFNode, nncf_graph: NNCFGraph, stat_key: str
     ) -> List[Tensor]:
-        act_node, output_port_id = self._get_activation_node_and_port(node, nncf_graph)
+        act_node, _ = self._get_activation_node_and_port(node, nncf_graph)

         def input_filter_func(point):
             # For the floating-point statistics collected in POST_LAYER style,
             # we also need to determine the output port id.
             # For the cases when the layer has more than one output port.
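+            # NB: Torch statistic points are registered as operator post-hooks and do
+            # not carry an output port id, so the port id check is relaxed below.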
            return (
                self._algorithm_key in point.algorithm_to_tensor_collectors
-                and point.target_point.type == TargetType.POST_LAYER_OPERATION
-                and point.target_point.port_id == output_port_id
+                and point.target_point.type in [TargetType.POST_LAYER_OPERATION, TargetType.OPERATOR_POST_HOOK]
+                # and point.target_point.port_id == output_port_id
+                # TODO: move the port id check into a backend-specific filter function
            )

        stats = []
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
index ec4dfab4711..2917c4ada3d 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -105,6 +105,8 @@ def mean_statistic_collector(

     @staticmethod
     def get_activation_port_id(node: NNCFNode, nncf_graph: NNCFGraph) -> int:
+        if node.layer_attributes.input_attributes["transpose"]:  # It works only for OV
+            raise nncf.UnsupportedModelError("Transposed input is not supported")
         constant_ports = node.layer_attributes.get_const_port_ids()
         activation_ports = [
             e.input_port_id for e in nncf_graph.get_input_edges(node) if e.input_port_id not in constant_ports
diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index a5572530857..90ff789a429 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -84,7 +84,7 @@ def __init__(

     @property
     def available_backends(self) -> List[BackendType]:
-        return [BackendType.OPENVINO]
+        return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX]

     def _set_backend_entity(self, model: TModel) -> None:
         """
@@ -101,6 +101,14 @@ def _set_backend_entity(self, model: TModel) -> None:
             from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend

             self._backend_entity = OVWeightCompressionAlgoBackend(model, self.name_to_node_mapping)
+        if model_backend == BackendType.TORCH:
+            from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend
+
+            self._backend_entity = PTWeightCompressionAlgoBackend(model, self.name_to_node_mapping)
+        if model_backend == BackendType.TORCH_FX:
+            from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend
+
+            self._backend_entity = FXWeightCompressionAlgoBackend(model, self.name_to_node_mapping)
         else:
             raise nncf.UnsupportedBackendError(
                 "Cannot return backend-specific AWQ entity because {} is not supported!".format(model_backend.value)
diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py
index 136c38413ab..3c5914a41cb 100644
--- a/nncf/quantization/algorithms/weight_compression/torch_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py
@@ -21,12 +21,20 @@
 from nncf.common.graph.operator_metatypes import OperatorMetatype
 from nncf.common.graph.transformations.commands import TargetType
 from nncf.common.graph.transformations.layout import TransformationLayout
+from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer
+from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer
+from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator
from 
nncf.experimental.common.tensor_statistics.collectors import MeanReducer +from nncf.experimental.common.tensor_statistics.collectors import MeanVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator from nncf.experimental.common.tensor_statistics.collectors import ShapeReducer from nncf.experimental.common.tensor_statistics.collectors import TensorCollector +from nncf.experimental.common.tensor_statistics.statistics import MaxVarianceTensorStatistic +from nncf.experimental.common.tensor_statistics.statistics import MeanMagnitudeTensorStatistic +from nncf.experimental.common.tensor_statistics.statistics import MeanVarianceTensorStatistic from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -291,3 +299,35 @@ def transform_model( transformed_model = PTModelTransformer(model).transform(transformation_layout) return transformed_model + + +class PTMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, PTWeightCompressionAlgoBackend): + @staticmethod + def mean_variance_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MeanVarianceReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MeanVarianceTensorStatistic) + collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator) + return collector + + @staticmethod + def max_variance_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MaxVarianceReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MaxVarianceTensorStatistic) + collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) + return collector + + @staticmethod + def mean_abs_max_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MeanAbsMaxReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MeanMagnitudeTensorStatistic) + collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) + return collector diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index c7c0a685244..794bc4a6427 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -22,10 +22,17 @@ from nncf.common.graph.operator_metatypes import OperatorMetatype from nncf.common.graph.transformations.commands import TargetType from nncf.common.graph.transformations.layout import TransformationLayout +from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer +from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer +from 
nncf.experimental.common.tensor_statistics.collectors import MeanAggregator from nncf.experimental.common.tensor_statistics.collectors import MeanReducer +from nncf.experimental.common.tensor_statistics.collectors import MeanVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator from nncf.experimental.common.tensor_statistics.collectors import ShapeReducer from nncf.experimental.common.tensor_statistics.collectors import TensorCollector +from nncf.experimental.common.tensor_statistics.statistics import MaxVarianceTensorStatistic +from nncf.experimental.common.tensor_statistics.statistics import MeanMagnitudeTensorStatistic +from nncf.experimental.common.tensor_statistics.statistics import MeanVarianceTensorStatistic from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.experimental.torch.fx.commands import FXApplyTransformationCommand from nncf.experimental.torch.fx.model_transformer import FXModelTransformer @@ -34,6 +41,7 @@ from nncf.experimental.torch.fx.transformations import constant_update_transformation_builder from nncf.experimental.torch.fx.transformations import module_insertion_transformation_builder from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -251,3 +259,35 @@ def transform_model( transformed_model = FXModelTransformer(model).transform(transformation_layout) return transformed_model + + +class FXMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, FXWeightCompressionAlgoBackend): + @staticmethod + def mean_variance_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MeanVarianceReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MeanVarianceTensorStatistic) + collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator) + return collector + + @staticmethod + def max_variance_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MaxVarianceReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MaxVarianceTensorStatistic) + collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) + return collector + + @staticmethod + def mean_abs_max_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MeanAbsMaxReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MeanMagnitudeTensorStatistic) + collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) + return collector diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 4c85ed42aaf..c8eb25d50e4 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -514,7 +514,6 @@ def compress_weights( options = { "awq": awq, - "scale_estimation": scale_estimation, "gptq": 
gptq,
            "lora_correction": lora_correction,
        }
@@ -524,12 +523,6 @@ def compress_weights(
                f"Torch backend does not support {', '.join(unsupported_options)} option(s). Set them to None."
            )

-        if sensitivity_metric not in [None, SensitivityMetric.WEIGHT_QUANTIZATION_ERROR]:
-            raise nncf.ParameterNotSupportedError(
-                "Torch backend only supports data-free sensitivity metric. "
-                "Set None or SensitivityMetric.WEIGHT_QUANTIZATION_ERROR."
-            )
-
        if advanced_parameters and advanced_parameters.statistics_path:
            raise nncf.ParameterNotSupportedError("Torch does not support statistics caching.")

@@ -546,7 +539,6 @@ def compress_weights(
        else:
            example_input = next(iter(dataset.get_inference_data()))
            model = wrap_model(model, example_input=example_input, trace_parameters=True)
-        dataset = None
        compression_weights_impl = pt_compression_weights_impl

    if backend == BackendType.TORCH_FX:

From 67cef71d22b3e06148b65a8de91374d646a16975 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Tue, 7 Jan 2025 13:33:53 +0100
Subject: [PATCH 02/46] upd sample

---
 examples/llm_compression/torch/main.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/llm_compression/torch/main.py b/examples/llm_compression/torch/main.py
index 88449b42281..0a213dadcbd 100644
--- a/examples/llm_compression/torch/main.py
+++ b/examples/llm_compression/torch/main.py
@@ -19,12 +19,12 @@

 import nncf

-MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"
+MODEL_ID = "PY007/TinyLlama-1.1B-Chat-v0.3"
 OUTPUT_DIR = "tinyllama_compressed"
 device = "cuda" if torch.cuda.is_available() else "cpu"


-def quantize(model, dataset):
+def quantize(model, tokenizer, dataset):
     quantization_dataset = nncf.Dataset(dataset)
     compressed_model = nncf.compress_weights(
         model,
@@ -51,23 +51,23 @@ def main():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, load_in_8bit=False).to(device)
     model.eval()

     dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
-    dataset = dataset.filter(lambda example: len(example["text"]) > 128)  # TODO: check whether this filter is necessary
+    # dataset = dataset.filter(lambda example: len(example["text"]) > 128)  # disabled: filtering degrades the validation metric
     # NOTE: the 128-character threshold is a heuristic to skip near-empty wikitext entries.

     def transform_fn(data):
-        tokenized_text = tokenizer(data["text"], return_tensors="pt")
+        tokenized_text = tokenizer(data["text"], return_tensors="pt").to(device)
         return tokenized_text.data  # return a dict, one of the input formats the NNCF engine accepts

     dataset = dataset.map(transform_fn).with_format("torch", device=device)

-    quantize(model, dataset)
-    compressed_model = OVModelForCausalLM.from_pretrained(
+    quantize(model, tokenizer, dataset)
+    model = OVModelForCausalLM.from_pretrained(
         OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"}
     )
-    validate(tokenizer, compressed_model)
+    validate(tokenizer, model)


 if __name__ == "__main__":

From 94d28503e037f570d5311a805633853a16dbc5a9 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Tue, 7 Jan 2025 13:42:03 +0100
Subject: [PATCH 03/46] fix reducers

---
 .../common/tensor_statistics/collectors.py   | 21 +++++++++++++------
 .../weight_compression/torch_backend.py      |  6 +++---
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/nncf/experimental/common/tensor_statistics/collectors.py b/nncf/experimental/common/tensor_statistics/collectors.py
index ce138709d71..dd4e15a114f 100644
--- a/nncf/experimental/common/tensor_statistics/collectors.py
+++ b/nncf/experimental/common/tensor_statistics/collectors.py
@@ -464,18 +464,27 @@ def _reduce_out_of_place(self, x: List[Tensor]) -> List[Tensor]:


 class MeanVarianceReducer(TensorReducerBase):
-    def _reduce_out_of_place(self, x: List[TensorType]) -> List[TensorType]:
-        raise NotImplementedError()
+    def _reduce_out_of_place(self, x: List[Tensor]) -> List[Tensor]:
+        x = x[0]
+        reduction_axes = self._get_reduction_axes(x)
+        variance = fns.var(x, reduction_axes)
+        return [fns.mean(variance)]


 class MaxVarianceReducer(TensorReducerBase):
-    def _reduce_out_of_place(self, x: List[TensorType]) -> List[TensorType]:
-        raise NotImplementedError()
+    def _reduce_out_of_place(self, x: List[Tensor]) -> List[Tensor]:
+        x = x[0]
+        reduction_axes = self._get_reduction_axes(x)
+        variance = fns.var(x, reduction_axes)
+        return [fns.max(variance)]


 class MeanAbsMaxReducer(TensorReducerBase):
-    def _reduce_out_of_place(self, x: List[TensorType]) -> List[TensorType]:
-        raise NotImplementedError()
+    def _reduce_out_of_place(self, x: List[Tensor]) -> List[Tensor]:
+        x = fns.abs(x[0])
+        reduction_axes = self._get_reduction_axes(x)
+        abs_max = fns.max(x, reduction_axes, keepdims=self._keepdims)
+        return [fns.mean(abs_max)]


 class QuantileReducerBase(TensorReducerBase):
diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py
index 3c5914a41cb..7ade18cc8c9 100644
--- a/nncf/quantization/algorithms/weight_compression/torch_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py
@@ -306,7 +306,7 @@ class PTMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, PTWeightCompression
     def mean_variance_statistic_collector(
         reduction_axes: Tuple[int], subset_size: Optional[int] = None
     ) -> TensorCollector:
-        reducer = MeanVarianceReducer(reduction_axes, inplace=True)
+        reducer = MeanVarianceReducer(reduction_axes)
         aggregator = MeanAggregator(num_samples=subset_size)
         collector = TensorCollector(MeanVarianceTensorStatistic)
         collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator)
         return collector
@@ -316,7 +316,7 @@ def 
mean_variance_statistic_collector( def max_variance_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MaxVarianceReducer(reduction_axes, inplace=True) + reducer = MaxVarianceReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MaxVarianceTensorStatistic) collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) @@ -326,7 +326,7 @@ def max_variance_statistic_collector( def mean_abs_max_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MeanAbsMaxReducer(reduction_axes, inplace=True) + reducer = MeanAbsMaxReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) From db42165c7138f89f0ab9ad7c0f4253abf319330f Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 7 Jan 2025 15:50:05 +0100 Subject: [PATCH 04/46] align SE with GPTQ --- .../weight_compression/algorithm.py | 27 +++++----- .../weight_compression/scale_estimation.py | 54 ++++++++----------- 2 files changed, 36 insertions(+), 45 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 14d0c800d32..8d8908bbe5b 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -266,6 +266,14 @@ def __init__( subset_size=gptq_params.subset_size, scale_estimation=self._scale_estimation, ) + if self._scale_estimation: + scale_estimation_params = self._advanced_parameters.scale_estimation_params + self._scale_estimation_algo = ScaleEstimation( + scale_estimation_params.subset_size, + scale_estimation_params.initial_steps, + scale_estimation_params.scale_steps, + scale_estimation_params.weight_penalty, + ) self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -616,18 +624,13 @@ def apply( ) else: if self._scale_estimation: - scale_estimation_params = self._advanced_parameters.scale_estimation_params - scales, zero_points = ScaleEstimation( - model, - self._backend_entity.name_to_node_mapping, - all_weight_params, - nodes_to_compress, - statistics, - scale_estimation_params.subset_size, - scale_estimation_params.initial_steps, - scale_estimation_params.scale_steps, - scale_estimation_params.weight_penalty, - ).apply(model, graph) + scales, zero_points = self._scale_estimation_algo.apply( + model=model, + graph=graph, + all_weight_params=all_weight_params, + statistics=statistics, + backend_entity=self._backend_entity, + ) if self._lora_correction: lora_correction_params = self._advanced_parameters.lora_correction_params diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 90ff789a429..f9d19c632e8 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -10,12 +10,11 @@ # limitations under the License. 
 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Tuple, TypeVar
+from typing import Dict, List, Optional, Tuple, TypeVar

 import nncf
 from nncf import Dataset
 from nncf.common.graph.graph import NNCFGraph
-from nncf.common.graph.graph import NNCFNode
 from nncf.common.logging.track_progress import track
 from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer
 from nncf.common.utils.backend import BackendType
@@ -48,22 +47,12 @@ class ScaleEstimation:

     def __init__(
         self,
-        model: TModel,
-        name_to_node_mapping: Dict[str, Any],
-        all_weight_params: List[WeightCompressionParameters],
-        nodes_to_compress: List[NNCFNode],
-        statistics: Dict[str, WCTensorStatistic],
         subset_size: int = 32,
         initial_steps: int = 5,
         scale_steps: int = 10,
         weight_penalty: float = -1.0,
     ):
         """
-        :param model: Model for applying algorithm.
-        :param name_to_node_mapping: Name to node mapping for updating node weights.
-        :param all_weight_params: List of all weight parameters.
-        :param nodes_to_compress: List of nodes for processing.
-        :param statistics: Input activation statistics for each node.
         :param subset_size: The number of samples for scale estimation.
         :param initial_steps: The number of the steps for absmax scale rectification.
         :param scale_steps: The number of the steps for grid search scale rectification.
         :param weight_penalty: coefficient for penalty between fp and compressed weights. If -1 then doesn't apply.
         """
         super().__init__()
-        self.name_to_node_mapping = name_to_node_mapping
-        self._all_weight_params = all_weight_params
-        self._nodes_to_compress = nodes_to_compress
-        self._statistics = statistics
         self._subset_size = subset_size
         self._initial_steps = initial_steps
         self._scale_steps = scale_steps
         self._weight_penalty = weight_penalty

-        self._set_backend_entity(model)
-
     @property
     def available_backends(self) -> List[BackendType]:
         return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX]

     def _set_backend_entity(self, model: TModel) -> None:
         """
         Creates a helper class with a backend-specific logic of the algorithm.

         :param model: Backend-specific input model.
-        :param all_weight_params: List of all weight parameters.
-        :param nodes_to_compress: List of nodes for processing.
-        :param activations: The input activations of the layers considered for compression.
""" - model_backend = get_backend(model) if model_backend == BackendType.OPENVINO: from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend - self._backend_entity = OVWeightCompressionAlgoBackend(model, self.name_to_node_mapping) - if model_backend == BackendType.TORCH: + self._backend_entity = OVWeightCompressionAlgoBackend(model) + elif model_backend == BackendType.TORCH: from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend - self._backend_entity = PTWeightCompressionAlgoBackend(model, self.name_to_node_mapping) - if model_backend == BackendType.TORCH_FX: + self._backend_entity = PTWeightCompressionAlgoBackend() + elif model_backend == BackendType.TORCH_FX: from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend - self._backend_entity = FXWeightCompressionAlgoBackend(model, self.name_to_node_mapping) + self._backend_entity = FXWeightCompressionAlgoBackend() else: raise nncf.UnsupportedBackendError( - "Cannot return backend-specific AWQ entity because {} is not supported!".format(model_backend.value) + "Cannot return backend-specific Scale Estimation entity because {} is not supported!".format( + model_backend.value + ) ) def apply( self, model: TModel, graph: NNCFGraph, + all_weight_params: List[WeightCompressionParameters], + statistics: Dict[str, WCTensorStatistic], statistic_points: Optional[StatisticPointsContainer] = None, dataset: Optional[Dataset] = None, + backend_entity: Optional[WeightCompressionAlgoBackend] = None, ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: """ Estimates better scale for the int4 nodes in the model. @@ -127,26 +111,30 @@ def apply( MatMul with compressed weights. The algorithm computes weighted scale for the group of weights in MatMul, which shared the same scale. - + :param all_weight_params: List of all weight parameters. + :param statistics: Input activation statistics for each node. :param model: Model for applying algorithm. :param graph: Model graph. :param statistic_points: Statistic points with collected statistics values. :param dataset: A representative dataset for the calibration process. + :param backend_entity: Weight compression algorithm backend. :return: Two dictionaries for estimated scales and zero points for each weight name. 
""" - + self._backend_entity = backend_entity + if self._backend_entity is None: + self._set_backend_entity(model) scales, zero_points = dict(), dict() - for wp in track(self._all_weight_params, description="Applying Scale Estimation"): + for wp in track(all_weight_params, description="Applying Scale Estimation"): weight_name = wp.weight_name node_name = wp.node_with_weight.node_name config = wp.compression_config - if config.num_bits != 4 or node_name not in self._statistics: + if config.num_bits != 4 or node_name not in statistics: scales[weight_name] = None continue - stats = self._statistics[node_name] + stats = statistics[node_name] weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm From f96788ae145af57ace91e804a373cf3beb490d1f Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 10:46:32 +0100 Subject: [PATCH 05/46] add tests --- .../test_reducers_and_aggregators.py | 45 +++++++++++++++++++ .../test_reducers_and_aggregators.py | 5 ++- .../ptq/test_weights_compression_backends.py | 38 ++++++++++++++++ 3 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 tests/torch/ptq/test_weights_compression_backends.py diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index a693efed0ac..db4b646caca 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -24,8 +24,11 @@ from nncf.experimental.common.tensor_statistics.collectors import AggregationAxes from nncf.experimental.common.tensor_statistics.collectors import HAWQAggregator from nncf.experimental.common.tensor_statistics.collectors import MaxAggregator +from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer +from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator from nncf.experimental.common.tensor_statistics.collectors import MeanNoOutliersAggregator +from nncf.experimental.common.tensor_statistics.collectors import MeanVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import MedianAbsoluteDeviationAggregator from nncf.experimental.common.tensor_statistics.collectors import MedianAggregator from nncf.experimental.common.tensor_statistics.collectors import MedianNoOutliersAggregator @@ -569,3 +572,45 @@ def test_hawq_aggregator(self, inputs, reference_output): ret_val = aggregator.aggregate() assert fns.allclose(ret_val, reference_output) + + @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) + def test_mean_variance_reducer(self, axes): + reducer = MeanVarianceReducer(reduction_axes=axes) + np_data = np.random.rand(3, 10, 10, 4) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + + # Calculate expected result using numpy + variance = np.var(np_data, axis=axes) + expected_result = np.mean(variance) + + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) + + @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) + def test_mean_abs_max_reducer(self, axes): + reducer = MeanAbsMaxReducer(reduction_axes=axes) + np_data = np.random.rand(3, 10, 10, 4) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + + # Calculate expected result 
using numpy + abs_max = np.max(np.abs(np_data), axis=axes) + expected_result = np.mean(abs_max) + + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) + + @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) + def test_max_variance_reducer(self, axes): + reducer = MaxVarianceReducer(reduction_axes=axes) + np_data = np.random.rand(3, 10, 10, 4) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + + # Calculate expected result using numpy + variance = np.var(np_data, axis=axes) + expected_result = np.max(variance) + + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) diff --git a/tests/openvino/native/quantization/test_reducers_and_aggregators.py b/tests/openvino/native/quantization/test_reducers_and_aggregators.py index eb2a23eaff2..1e306829fd1 100644 --- a/tests/openvino/native/quantization/test_reducers_and_aggregators.py +++ b/tests/openvino/native/quantization/test_reducers_and_aggregators.py @@ -75,12 +75,13 @@ def squeeze_tensor(self, ref_tensor: List[Any], axes: Optional[Tuple[int]] = Non def cast_tensor(self, tensor, dtype: Dtype): return tensor + @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("reducer_cls,reduction_axes,ref_value", MIXED_PRECISION_REDUCERS_REF_VALUES) - def test_mixed_precision_reducers(self, reducer_cls, reduction_axes, ref_value): + def test_mixed_precision_reducers(self, reducer_cls, reduction_axes, ref_value, inplace): input_ = np.arange(2 * 4 * 8).reshape(2, 4, 8) input_[:, :2] *= 2 - reducer = reducer_cls(reduction_axes=reduction_axes, inplace=True) + reducer = reducer_cls(reduction_axes=reduction_axes, inplace=inplace) inplace_fn = reducer.get_inplace_fn() ov_model_input = opset.parameter(input_.shape) diff --git a/tests/torch/ptq/test_weights_compression_backends.py b/tests/torch/ptq/test_weights_compression_backends.py new file mode 100644 index 00000000000..bd66093ec19 --- /dev/null +++ b/tests/torch/ptq/test_weights_compression_backends.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from nncf.quantization.algorithms.weight_compression.mixed_precision import HAWQCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MaxVarianceCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MeanMaxCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MeanVarianceCriterion +from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend +from tests.cross_fw.test_templates.test_weights_compression_backends import TemplateTestMixedPrecisionAlgoBackend + + +class TestPTMixedPrecisionAlgoBackend(TemplateTestMixedPrecisionAlgoBackend): + def get_hawq_with_backend(self, subset_size): + hawq = HAWQCriterion(None, None, subset_size=subset_size) + hawq._backend_entity = PTMixedPrecisionAlgoBackend() + return hawq + + def get_mean_variance_with_backend(self, subset_size: int): + mean_variance = MeanVarianceCriterion(None, None, subset_size=subset_size) + mean_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return mean_variance + + def get_max_variance_with_backend(self, subset_size: int): + max_variance = MaxVarianceCriterion(None, None, subset_size=subset_size) + max_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return max_variance + + def get_mean_max_with_backend(self, subset_size: int): + mean_max_variance = MeanMaxCriterion(None, None, subset_size=subset_size) + mean_max_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return mean_max_variance From b1d4c477e85258307c302af80b20e81506dac6fa Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 10:58:49 +0100 Subject: [PATCH 06/46] backend method - get_filter_fn_for_statistics --- .../algorithms/weight_compression/backend.py | 14 +++++++++++++- .../weight_compression/mixed_precision.py | 18 ++++-------------- .../weight_compression/openvino_backend.py | 16 +++++++++++++++- .../weight_compression/torch_backend.py | 15 ++++++++++++++- .../weight_compression/torch_fx_backend.py | 15 ++++++++++++++- 5 files changed, 60 insertions(+), 18 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 004bb08baef..a4954c3ddd9 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -11,7 +11,7 @@ from abc import ABC from abc import abstractmethod -from typing import Dict, Iterable, List, Optional, Tuple, TypeVar +from typing import Callable, Dict, Iterable, List, Optional, Tuple, TypeVar from nncf.common.graph import NNCFGraph from nncf.common.graph import NNCFNode @@ -19,6 +19,7 @@ from nncf.common.graph.transformations.commands import TargetPoint from nncf.common.graph.transformations.commands import TargetType from nncf.common.tensor_statistics.collectors import TensorStatisticCollectorBase +from nncf.common.tensor_statistics.statistic_point import StatisticPoint from nncf.experimental.common.tensor_statistics.collectors import HAWQAggregator from nncf.experimental.common.tensor_statistics.collectors import RawReducer from nncf.experimental.common.tensor_statistics.collectors import TensorCollector @@ -278,3 +279,14 @@ def mean_abs_max_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: pass + + @staticmethod + @abstractmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], 
bool]:
+        """
+        Returns backend-specific callable to filter statistic containers according to its statistic point.
+
+        :param activation_port_id: Activation port id for the statistic collection target node.
+        :param algorithm_key: Current algorithm key.
+        :return: Backend-specific callable to filter statistic containers according to its statistic point.
+        """
diff --git a/nncf/quantization/algorithms/weight_compression/mixed_precision.py b/nncf/quantization/algorithms/weight_compression/mixed_precision.py
index 900b639feeb..0c6775a2c11 100644
--- a/nncf/quantization/algorithms/weight_compression/mixed_precision.py
+++ b/nncf/quantization/algorithms/weight_compression/mixed_precision.py
@@ -311,22 +311,12 @@ def _get_activation_node_and_port(self, node: NNCFNode, nncf_graph: NNCFGraph) -
     def _get_statistics_for_node(
         self, statistic_points: StatisticPointsContainer, node: NNCFNode, nncf_graph: NNCFGraph, stat_key: str
     ) -> List[Tensor]:
-        act_node, _ = self._get_activation_node_and_port(node, nncf_graph)
-
-        def input_filter_func(point):
-            # For the floating-point statistics collected in POST_LAYER style,
-            # we also need to determine the output port id.
-            # For the cases when the layer has more than one output port.
-            # NB: Torch statistic points are registered as operator post-hooks and do
-            # not carry an output port id, so the port id check is relaxed below.
-            return (
-                self._algorithm_key in point.algorithm_to_tensor_collectors
-                and point.target_point.type in [TargetType.POST_LAYER_OPERATION, TargetType.OPERATOR_POST_HOOK]
-                # and point.target_point.port_id == output_port_id
-                # TODO: move the port id check into a backend-specific filter function
-            )
-
+        act_node, act_port_id = self._get_activation_node_and_port(node, nncf_graph)
         stats = []
         for tensor_collector in statistic_points.get_algo_statistics_for_node(
             act_node.node_name,
+            self._backend_entity.get_filter_fn_for_statistics(act_port_id, self._algorithm_key),
+            self._algorithm_key,
         ):
             statistics = tensor_collector.get_statistics()
             for data in statistics.get_data().values():
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
index ec4dfab4711..7b9d04e5864 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -8,7 +8,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, Iterable, List, Optional, Tuple +from typing import Callable, Dict, Iterable, List, Optional, Tuple import openvino as ov from openvino import Type @@ -21,6 +21,7 @@ from nncf.common.graph.operator_metatypes import OperatorMetatype from nncf.common.graph.transformations.commands import TargetType from nncf.common.graph.utils import get_reduction_axes +from nncf.common.tensor_statistics.statistic_point import StatisticPoint from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator from nncf.experimental.common.tensor_statistics.collectors import TensorCollector @@ -53,6 +54,8 @@ from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType +OV_POST_LAYER_TARGET_TYPE = TargetType.POST_LAYER_OPERATION + class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): def __init__(self, model: ov.Model, name_to_node_mapping: Dict = None): @@ -432,3 +435,14 @@ def mean_abs_max_statistic_collector( collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) return collector + + @staticmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + def filter_func(point: StatisticPoint) -> bool: + return ( + algorithm_key in point.algorithm_to_tensor_collectors + and point.target_point.type == OV_POST_LAYER_TARGET_TYPE + and point.target_point.port_id == activation_port_id + ) + + return filter_func diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 7ade18cc8c9..4746d665ea5 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict, Iterable, List, Optional, Tuple +from typing import Callable, Dict, Iterable, List, Optional, Tuple import torch @@ -21,6 +21,7 @@ from nncf.common.graph.operator_metatypes import OperatorMetatype from nncf.common.graph.transformations.commands import TargetType from nncf.common.graph.transformations.layout import TransformationLayout +from nncf.common.tensor_statistics.statistic_point import StatisticPoint from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator @@ -56,6 +57,8 @@ from nncf.torch.quantization.layers import INT8AsymmetricWeightsDecompressor from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor +PT_POST_LAYER_TARGET_TYPE = TargetType.OPERATOR_POST_HOOK + class PTWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): TARGET_TYPE_TO_PT_INS_TYPE_MAP = { @@ -331,3 +334,13 @@ def mean_abs_max_statistic_collector( collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) return collector + + @staticmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + def filter_func(point: StatisticPoint) -> bool: + return ( + algorithm_key in point.algorithm_to_tensor_collectors + and point.target_point.type == PT_POST_LAYER_TARGET_TYPE + ) + + return filter_func diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 794bc4a6427..89c78b1a9f4 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Callable, Dict, Iterable, List, Optional, Tuple

 import torch
 import torch.fx
@@ -22,6 +22,7 @@
 from nncf.common.graph.operator_metatypes import OperatorMetatype
 from nncf.common.graph.transformations.commands import TargetType
 from nncf.common.graph.transformations.layout import TransformationLayout
+from nncf.common.tensor_statistics.statistic_point import StatisticPoint
 from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer
 from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer
 from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator
@@ -58,6 +59,8 @@
 from nncf.torch.quantization.layers import INT8AsymmetricWeightsDecompressor
 from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor

+PT_POST_LAYER_TARGET_TYPE = TargetType.OPERATOR_POST_HOOK
+

 class FXWeightCompressionAlgoBackend(WeightCompressionAlgoBackend):
     MATMUL_METATYPES = PTWeightCompressionAlgoBackend.MATMUL_METATYPES
@@ -291,3 +294,13 @@ def mean_abs_max_statistic_collector(
         collector = TensorCollector(MeanMagnitudeTensorStatistic)
         collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator)
         return collector
+
+    @staticmethod
+    def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]:
+        def filter_func(point: StatisticPoint) -> bool:
+            return (
+                algorithm_key in point.algorithm_to_tensor_collectors
+                and point.target_point.type == PT_POST_LAYER_TARGET_TYPE
+            )
+
+        return filter_func

From 51ccdd6c059605a7aaa872f937102d7499ad560c Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Wed, 8 Jan 2025 11:09:08 +0100
Subject: [PATCH 07/46] fixes

---
 examples/llm_compression/torch/main.py                 | 4 ++--
 .../algorithms/weight_compression/scale_estimation.py  | 5 +++--
 .../algorithms/weight_compression/torch_fx_backend.py  | 6 +++---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/examples/llm_compression/torch/main.py b/examples/llm_compression/torch/main.py
index 0a213dadcbd..a91508eef58 100644
--- a/examples/llm_compression/torch/main.py
+++ b/examples/llm_compression/torch/main.py
@@ -24,7 +24,7 @@
 device = "cuda" if torch.cuda.is_available() else "cpu"


-def quantize(model, tokenizer, dataset):
+def quantize(model, dataset):
     quantization_dataset = nncf.Dataset(dataset)
     compressed_model = nncf.compress_weights(
         model,
@@ -63,7 +63,7 @@ def transform_fn(data):

     dataset = dataset.map(transform_fn).with_format("torch", device=device)

-    quantize(model, tokenizer, dataset)
+    quantize(model, dataset)
     model = OVModelForCausalLM.from_pretrained(
         OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"}
     )
     validate(tokenizer, model)
diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index f9d19c632e8..c3fe7050e7c 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -111,10 +111,11 @@ def apply(
         MatMul with compressed weights.
         The algorithm computes weighted scale for the group of weights in MatMul, which
         share the same scale.
-        :param all_weight_params: List of all weight parameters.
-        :param statistics: Input activation statistics for each node.
+
         :param model: Model for applying algorithm.
         :param graph: Model graph.
+ :param all_weight_params: List of all weight parameters. + :param statistics: Input activation statistics for each node. :param statistic_points: Statistic points with collected statistics values. :param dataset: A representative dataset for the calibration process. :param backend_entity: Weight compression algorithm backend. diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 89c78b1a9f4..9764327b5d6 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -269,7 +269,7 @@ class FXMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, FXWeightCompression def mean_variance_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MeanVarianceReducer(reduction_axes, inplace=True) + reducer = MeanVarianceReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MeanVarianceTensorStatistic) collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator) @@ -279,7 +279,7 @@ def mean_variance_statistic_collector( def max_variance_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MaxVarianceReducer(reduction_axes, inplace=True) + reducer = MaxVarianceReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MaxVarianceTensorStatistic) collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) @@ -289,7 +289,7 @@ def max_variance_statistic_collector( def mean_abs_max_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MeanAbsMaxReducer(reduction_axes, inplace=True) + reducer = MeanAbsMaxReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) From e37ef52c9aebd9aec7da6394428c1bea6d6d76db Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 11:35:36 +0100 Subject: [PATCH 08/46] sample --- examples/llm_compression/torch/{ => llama_3_2}/main.py | 4 ++-- examples/llm_compression/torch/llama_3_2/requirements.txt | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) rename examples/llm_compression/torch/{ => llama_3_2}/main.py (97%) create mode 100644 examples/llm_compression/torch/llama_3_2/requirements.txt diff --git a/examples/llm_compression/torch/main.py b/examples/llm_compression/torch/llama_3_2/main.py similarity index 97% rename from examples/llm_compression/torch/main.py rename to examples/llm_compression/torch/llama_3_2/main.py index a91508eef58..ff03296a47e 100644 --- a/examples/llm_compression/torch/main.py +++ b/examples/llm_compression/torch/llama_3_2/main.py @@ -19,8 +19,8 @@ import nncf -MODEL_ID = "PY007/TinyLlama-1.1B-Chat-v0.3" -OUTPUT_DIR = "tinyllama_compressed" +MODEL_ID = "unsloth/Llama-3.2-1B" +OUTPUT_DIR = "compressed" device = "cuda" if torch.cuda.is_available() else "cpu" diff --git a/examples/llm_compression/torch/llama_3_2/requirements.txt b/examples/llm_compression/torch/llama_3_2/requirements.txt new file mode 100644 index 00000000000..e29c588e595 --- /dev/null +++ 
b/examples/llm_compression/torch/llama_3_2/requirements.txt @@ -0,0 +1,4 @@ +transformers +datasets==3.2 +openvino==2024.6 +optimum-intel[openvino] From d1843ad71a6238eaf5d57fee57379d18822c58ae Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:15:45 +0100 Subject: [PATCH 09/46] add tinyllama_data_aware, tinyllama_scale_estimation_per_channel for torch --- tests/post_training/data/wc_reference_data.yaml | 8 ++++++++ tests/post_training/model_scope.py | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 6c48904c91a..3d27d81ee20 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -6,6 +6,10 @@ tinyllama_data_aware_backend_OV: metric_value: 0.85767 num_int4: 94 num_int8: 124 +tinyllama_data_aware_backend_TORCH: + metric_value: 0.85767 + num_int4: 94 + num_int8: 124 tinyllama_data_aware_awq_stateful_backend_OV: metric_value: 0.85616 num_int4: 94 @@ -31,6 +35,10 @@ tinyllama_scale_estimation_per_channel_backend_OV: metric_value: 0.81389 num_int4: 188 num_int8: 124 +tinyllama_scale_estimation_per_channel_backend_TORCH: + metric_value: 0.81389 + num_int4: 188 + num_int8: 124 tinyllama_data_aware_lora_stateful_backend_OV: metric_value: 0.83446 num_int4: 94 diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index beea18e48fc..ad2dd36757d 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -389,7 +389,7 @@ "model_id": "tinyllama/tinyllama-1.1b-step-50k-105b", "pipeline_cls": LMWeightCompression, "compression_params": {"group_size": 64, "ratio": 0.8, "mode": CompressWeightsMode.INT4_SYM}, - "backends": [BackendType.OV], + "backends": [BackendType.OV, BackendType.TORCH], }, { "reported_name": "tinyllama_data_aware_awq_stateful", @@ -496,7 +496,7 @@ "mode": CompressWeightsMode.INT4_ASYM, "scale_estimation": True, }, - "backends": [BackendType.OV], + "backends": [BackendType.OV, BackendType.TORCH], }, { "reported_name": "tinyllama_data_aware_lora_stateful", From cd79e80c3affc8342cb6d7689b4442b834a618a9 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:17:54 +0100 Subject: [PATCH 10/46] fix precommit --- tests/torch/experimental/sparsify_activations/test_algo.py | 2 -- .../experimental/sparsify_activations/test_components.py | 6 +----- tests/torch/ptq/test_weights_compression.py | 2 -- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/torch/experimental/sparsify_activations/test_algo.py b/tests/torch/experimental/sparsify_activations/test_algo.py index c5616aa8372..64d245d3639 100644 --- a/tests/torch/experimental/sparsify_activations/test_algo.py +++ b/tests/torch/experimental/sparsify_activations/test_algo.py @@ -128,7 +128,6 @@ class SparsifyActivationsAlgorithmTestDesc: @pytest.mark.parametrize("compress_weights", [False, True], scope="class") @pytest.mark.parametrize("use_cuda", [False, True], ids=["cpu", "cuda"], scope="class") class TestSparsifyActivationsAlgorithm: - @pytest.fixture(autouse=True, scope="class") def setup(self, request, desc: SparsifyActivationsAlgorithmTestDesc, compress_weights: bool, use_cuda: bool): if use_cuda and not torch.cuda.is_available(): @@ -146,7 +145,6 @@ def setup(self, request, desc: SparsifyActivationsAlgorithmTestDesc, compress_we model = nncf.compress_weights( model, mode=nncf.CompressWeightsMode.INT8_SYM, - dataset=dataset, ) model = 
nncf.experimental.torch.sparsify_activations.sparsify_activations( model=model, diff --git a/tests/torch/experimental/sparsify_activations/test_components.py b/tests/torch/experimental/sparsify_activations/test_components.py index f74ceab3c1b..938a4facf1b 100644 --- a/tests/torch/experimental/sparsify_activations/test_components.py +++ b/tests/torch/experimental/sparsify_activations/test_components.py @@ -217,11 +217,7 @@ def create_model_and_dataset(self, compress_weights: bool = False): model = ThreeLinearModel() dataset = nncf.Dataset(torch.randint(0, 30, (3, 2, 8))) if compress_weights: - model = nncf.compress_weights( - model, - mode=nncf.CompressWeightsMode.INT8_SYM, - dataset=dataset, - ) + model = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8_SYM) else: model = wrap_model( model, diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 12f113863f7..83e211a5bd2 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -236,10 +236,8 @@ def test_raise_error_with_unsupported_params_for_int8(mode, params): @pytest.mark.parametrize( "params", ( - *({"sensitivity_metric": metric} for metric in DATA_BASED_SENSITIVITY_METRICS), {"gptq": True}, {"awq": True}, - {"scale_estimation": True}, {"lora_correction": True}, ), ) From df6b43ba0cf9d75158dd63d31c6dee1f84173d49 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:30:21 +0100 Subject: [PATCH 11/46] minor --- tests/torch/ptq/test_weights_compression.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 83e211a5bd2..091e9192fe4 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -30,14 +30,7 @@ from nncf.torch.quantization.quantize_functions import unpack_uint4 from tests.torch.test_models.synthetic import ShortTransformer -DATA_BASED_SENSITIVITY_METRICS = ( - SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - SensitivityMetric.MEAN_ACTIVATION_VARIANCE, - SensitivityMetric.MAX_ACTIVATION_VARIANCE, - SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, -) - -ALL_SENSITIVITY_METRICS = DATA_BASED_SENSITIVITY_METRICS + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR,) +ALL_SENSITIVITY_METRICS = all_sensitivity_metrics = list(SensitivityMetric) INT8_MODES = (CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM) INT4_MODES = (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM) From 368054a28985093b141158eea83c53554cc9c15e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:47:45 +0100 Subject: [PATCH 12/46] refactor test --- .../test_reducers_and_aggregators.py | 90 ++++++++++--------- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index 5b6c2f45339..5f83bf64875 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -235,6 +235,54 @@ def test_quantile_reducers(self, reducer_name, ref, reducers): for i, ref_ in enumerate(ref): assert fns.allclose(val[i], self.get_nncf_tensor(ref_)) + @pytest.mark.parametrize( + "axes, np_data, reference", + [ + [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, 
-6]]]), 14.25], + [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 15.875], + [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + ], + ) + def test_mean_variance_reducer(self, axes, np_data, reference): + reducer = MeanVarianceReducer(reduction_axes=axes) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(reference)) + + @pytest.mark.parametrize( + "axes, np_data, reference", + [ + [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 10.0], + [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 4.16666], + [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 6.33333], + [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 10.0], + ], + ) + def test_mean_abs_max_reducer(self, axes, np_data, reference): + reducer = MeanAbsMaxReducer(reduction_axes=axes) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(reference)) + + @pytest.mark.parametrize( + "axes, np_data, reference", + [ + [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 64.0], + [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 36.1875], + [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + ], + ) + def test_max_variance_reducer(self, axes, np_data, reference): + reducer = MaxVarianceReducer(reduction_axes=axes) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(reference)) + @pytest.mark.parametrize( "reducer_name,ref,kwargs", [ @@ -572,45 +620,3 @@ def test_hawq_aggregator(self, inputs, reference_output): ret_val = aggregator.aggregate() assert fns.allclose(ret_val, reference_output) - - @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) - def test_mean_variance_reducer(self, axes): - reducer = MeanVarianceReducer(reduction_axes=axes) - np_data = np.random.rand(3, 10, 10, 4) - nncf_data = self.get_nncf_tensor(np_data) - result = reducer._reduce_out_of_place([nncf_data]) - - # Calculate expected result using numpy - variance = np.var(np_data, axis=axes) - expected_result = np.mean(variance) - - assert len(result) == 1 - assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) - - @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) - def test_mean_abs_max_reducer(self, axes): - reducer = MeanAbsMaxReducer(reduction_axes=axes) - np_data = np.random.rand(3, 10, 10, 4) - nncf_data = self.get_nncf_tensor(np_data) - result = reducer._reduce_out_of_place([nncf_data]) - - # Calculate expected result using numpy - abs_max = np.max(np.abs(np_data), axis=axes) - expected_result = np.mean(abs_max) - - assert len(result) == 1 - assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) - - @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) - def test_max_variance_reducer(self, axes): - reducer = MaxVarianceReducer(reduction_axes=axes) - np_data = np.random.rand(3, 10, 10, 4) - nncf_data = self.get_nncf_tensor(np_data) - result = reducer._reduce_out_of_place([nncf_data]) - - # Calculate expected result 
using numpy - variance = np.var(np_data, axis=axes) - expected_result = np.max(variance) - - assert len(result) == 1 - assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) From e2a6f46fc6d2c221b6f019ceaa1e6de12b6e78e3 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:54:35 +0100 Subject: [PATCH 13/46] add WA for dataset --- nncf/quantization/quantize_model.py | 2 ++ .../torch/experimental/sparsify_activations/test_components.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 7e99a7e71a4..64ba790c264 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -539,6 +539,8 @@ def compress_weights( else: example_input = next(iter(dataset.get_inference_data())) model = wrap_model(model, example_input=example_input, trace_parameters=True) + if mode in (CompressWeightsMode.INT8, CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM): + dataset = None # workaround as INT8 mode still does not support dataset compression_weights_impl = pt_compression_weights_impl if backend == BackendType.TORCH_FX: diff --git a/tests/torch/experimental/sparsify_activations/test_components.py b/tests/torch/experimental/sparsify_activations/test_components.py index 938a4facf1b..09ed2adc740 100644 --- a/tests/torch/experimental/sparsify_activations/test_components.py +++ b/tests/torch/experimental/sparsify_activations/test_components.py @@ -217,7 +217,7 @@ def create_model_and_dataset(self, compress_weights: bool = False): model = ThreeLinearModel() dataset = nncf.Dataset(torch.randint(0, 30, (3, 2, 8))) if compress_weights: - model = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8_SYM) + model = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8_SYM, dataset=dataset) else: model = wrap_model( model, From dbf2b1df017374c925bd10416e6c1d522572737b Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 13:07:41 +0100 Subject: [PATCH 14/46] fix --- tests/torch/experimental/sparsify_activations/test_algo.py | 2 ++ tests/torch/ptq/test_weights_compression.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/torch/experimental/sparsify_activations/test_algo.py b/tests/torch/experimental/sparsify_activations/test_algo.py index 64d245d3639..c5616aa8372 100644 --- a/tests/torch/experimental/sparsify_activations/test_algo.py +++ b/tests/torch/experimental/sparsify_activations/test_algo.py @@ -128,6 +128,7 @@ class SparsifyActivationsAlgorithmTestDesc: @pytest.mark.parametrize("compress_weights", [False, True], scope="class") @pytest.mark.parametrize("use_cuda", [False, True], ids=["cpu", "cuda"], scope="class") class TestSparsifyActivationsAlgorithm: + @pytest.fixture(autouse=True, scope="class") def setup(self, request, desc: SparsifyActivationsAlgorithmTestDesc, compress_weights: bool, use_cuda: bool): if use_cuda and not torch.cuda.is_available(): @@ -145,6 +146,7 @@ def setup(self, request, desc: SparsifyActivationsAlgorithmTestDesc, compress_we model = nncf.compress_weights( model, mode=nncf.CompressWeightsMode.INT8_SYM, + dataset=dataset, ) model = nncf.experimental.torch.sparsify_activations.sparsify_activations( model=model, diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 091e9192fe4..979326cfb36 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -30,7 +30,7 
@@ from nncf.torch.quantization.quantize_functions import unpack_uint4 from tests.torch.test_models.synthetic import ShortTransformer -ALL_SENSITIVITY_METRICS = all_sensitivity_metrics = list(SensitivityMetric) +ALL_SENSITIVITY_METRICS = list(SensitivityMetric) INT8_MODES = (CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM) INT4_MODES = (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM) From 702f8b10b09226c6dea892b4de21309ae575f88f Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 13:27:51 +0100 Subject: [PATCH 15/46] dtype --- tests/common/experimental/test_reducers_and_aggregators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index 5f83bf64875..7d60f0fc01a 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -246,7 +246,7 @@ def test_quantile_reducers(self, reducer_name, ref, reducers): ) def test_mean_variance_reducer(self, axes, np_data, reference): reducer = MeanVarianceReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data) + nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) @@ -262,7 +262,7 @@ def test_mean_variance_reducer(self, axes, np_data, reference): ) def test_mean_abs_max_reducer(self, axes, np_data, reference): reducer = MeanAbsMaxReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data) + nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) @@ -278,7 +278,7 @@ def test_mean_abs_max_reducer(self, axes, np_data, reference): ) def test_max_variance_reducer(self, axes, np_data, reference): reducer = MaxVarianceReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data) + nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) From 24e39c23379571f8ad5640a6a192ee9e8d96f260 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 13:28:54 +0100 Subject: [PATCH 16/46] polishing --- .../algorithms/weight_compression/openvino_backend.py | 6 ++---- .../algorithms/weight_compression/torch_backend.py | 5 ++--- .../algorithms/weight_compression/torch_fx_backend.py | 5 ++--- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index cda28b0c9d3..90005c09028 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -54,8 +54,6 @@ from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType -OV_POST_LAYER_TARGET_TYPE = TargetType.POST_LAYER_OPERATION - class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): def __init__(self, model: ov.Model, name_to_node_mapping: Dict = None): @@ -108,7 +106,7 @@ def mean_statistic_collector( @staticmethod def get_activation_port_id(node: NNCFNode, nncf_graph: NNCFGraph) -> int: - if 
node.layer_attributes.input_attributes["transpose"]: # It works only for OV + if node.layer_attributes.input_attributes["transpose"]: raise nncf.UnsupportedModelError("Transposed input is not supported") constant_ports = node.layer_attributes.get_const_port_ids() activation_ports = [ @@ -441,7 +439,7 @@ def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> def filter_func(point: StatisticPoint) -> bool: return ( algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type == OV_POST_LAYER_TARGET_TYPE + and point.target_point.type == TargetType.POST_LAYER_OPERATION and point.target_point.port_id == activation_port_id ) diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index acf8e081bed..9ccf63977ff 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -57,8 +57,6 @@ from nncf.torch.quantization.layers import INT8AsymmetricWeightsDecompressor from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -PT_POST_LAYER_TARGET_TYPE = TargetType.OPERATOR_POST_HOOK - class PTWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): TARGET_TYPE_TO_PT_INS_TYPE_MAP = { @@ -340,7 +338,8 @@ def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> def filter_func(point: StatisticPoint) -> bool: return ( algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type == PT_POST_LAYER_TARGET_TYPE + and point.target_point.type + == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] ) return filter_func diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 032d77983e1..449957d3ecb 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -59,8 +59,6 @@ from nncf.torch.quantization.layers import INT8AsymmetricWeightsDecompressor from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -PT_POST_LAYER_TARGET_TYPE = TargetType.OPERATOR_POST_HOOK - class FXWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): MATMUL_METATYPES = PTWeightCompressionAlgoBackend.MATMUL_METATYPES @@ -300,7 +298,8 @@ def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> def filter_func(point: StatisticPoint) -> bool: return ( algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type == PT_POST_LAYER_TARGET_TYPE + and point.target_point.type + == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] ) return filter_func From e97078b71fc6f9cb7cc2f08fe94a3c4071ef6030 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 10:28:34 +0100 Subject: [PATCH 17/46] updates for torch --- .../weight_compression/algorithm.py | 15 +-- .../algorithms/weight_compression/backend.py | 22 ++-- .../weight_compression/openvino_backend.py | 23 ++-- .../weight_compression/scale_estimation.py | 9 +- .../weight_compression/torch_backend.py | 105 ++++++++++++++++-- nncf/torch/engine.py | 11 +- nncf/torch/quantization/layers.py | 4 +- 7 files changed, 133 insertions(+), 56 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py 
b/nncf/quantization/algorithms/weight_compression/algorithm.py index f85bd74a8e4..7973712da9a 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -12,7 +12,6 @@ import operator from collections import OrderedDict from collections import defaultdict -from functools import partial from functools import reduce from typing import Any, Dict, Iterable, List, Optional, Tuple, TypeVar @@ -812,16 +811,6 @@ def _get_statistics_for_weights_compression( :return: Collected statistics. """ - def input_filter_func(point, port_id): - # For the floating-point statistics collected in POST_LAYER style, - # we also need to determine the output port id. - # For the cases when the layer has more than one (0) output port. - return ( - self._algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type == TargetType.POST_LAYER_OPERATION - and point.target_point.port_id == port_id - ) - # For each node we store statistics in a WCTensorStatistics data-class. It contains the following fields: # mean_values=[mean_value_1, ..., mean_value_n] # shapes=[shape_1, ..., shape_n] @@ -831,7 +820,9 @@ def input_filter_func(point, port_id): for (act_node, output_port_id), matmul_nodes in matmul_input_to_output_nodes_map.items(): tensor_collectors = list( statistic_points.get_algo_statistics_for_node( - act_node.node_name, partial(input_filter_func, port_id=output_port_id), self._algorithm_key + act_node.node_name, + self._backend_entity.get_filter_fn_for_statistics(output_port_id, self._algorithm_key), + self._algorithm_key, ) ) # Statistics could be empty in case when the statistics is registered for another algorithm, diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 9f1f97ef928..c8ea964a288 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -235,6 +235,17 @@ def dump_parameters( :param path: Optional list of the paths. """ + @staticmethod + @abstractmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + """ + Returns backend-specific callable to filter statistic containers according to its statistic point. + + :param activation_port_id: Activation port id for the statistic collection target node. + :param algorithm_key: Current algorithm key. + :return: Backend-specific callable to filter statistic containers according to its statistic point. + """ + class AWQAlgoBackend(WeightCompressionAlgoBackend): @staticmethod @@ -279,14 +290,3 @@ def mean_abs_max_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: pass - - @staticmethod - @abstractmethod - def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: - """ - Returns backend-specific callable to filter statistic containers according to its statistic point. - - :param activation_port_id: Activation port id for the statistic collection target node. - :param algorithm_key: Current algorithm key. - :return: Backend-specific callable to filter statistic containers according to its statistic point. 
-        """
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
index 90005c09028..53cf2c6a03f 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -374,6 +374,7 @@ def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p
         s = opset.parameter(s_shape, name="s")
         parameters = [w, s]
         compressed_w = w / s
+        compressed_w.get_rt_info()["nonconvertable_divide_0"] = True
         if z_p_shape is not None:
             zp = opset.parameter(z_p_shape, name="zp")
             parameters.append(zp)
@@ -390,6 +391,17 @@ def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p
 
         return lambda parameters: compiled_model(parameters)[0]
 
+    @staticmethod
+    def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]:
+        def filter_func(point: StatisticPoint) -> bool:
+            return (
+                algorithm_key in point.algorithm_to_tensor_collectors
+                and point.target_point.type == TargetType.POST_LAYER_OPERATION
+                and point.target_point.port_id == activation_port_id
+            )
+
+        return filter_func
+
 
 class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend):
     @staticmethod
@@ -433,14 +445,3 @@ def mean_abs_max_statistic_collector(
         collector = TensorCollector(MeanMagnitudeTensorStatistic)
         collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator)
         return collector
-
-    @staticmethod
-    def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]:
-        def filter_func(point: StatisticPoint) -> bool:
-            return (
-                algorithm_key in point.algorithm_to_tensor_collectors
-                and point.target_point.type == TargetType.POST_LAYER_OPERATION
-                and point.target_point.port_id == activation_port_id
-            )
-
-        return filter_func
diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index 6fae3ed905f..b120092f6ca 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -237,7 +237,7 @@ def calculate_quantization_params(
 
         # normalize importances for every group of weights to make sum of them equal to 1.0
         denum = fns.sum(importance, axis=2, keepdims=True)
-        importance = importance / (denum + eps)
+        importance = importance / (denum + eps)  # for each weight in a group
 
         X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size)
         q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size)
@@ -274,11 +274,11 @@ def calculate_quantization_params(
         zero_scale = 0.001
         zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
 
-        input_tensors = [original_weight.data, None]
+        input_tensors = [original_weight.data, None, None]
         if zp is not None:
-            input_tensors.append(zp.data)
+            input_tensors[2] = zp.data
         # iterative rectification of initial scale
-        for i in range(initial_steps):
+        for i in range(initial_steps):  # make several iterations of updating the scale
             near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
             near_to_ideal_scale = near_to_ideal_scale * scale_sign
             input_tensors[1] = near_to_ideal_scale.data
@@ -406,6 +406,7 @@ def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None
 
 
 def estimate_scales(weight:
Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: + # ideal scale to determine the importance of the weights """ Estimates scales for the given weight, target, zero mask, and importance. diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 9ccf63977ff..09cd811f206 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -12,6 +12,7 @@ from typing import Callable, Dict, Iterable, List, Optional, Tuple import torch +import torch.nn as nn import nncf from nncf.common.graph.definitions import NNCFGraphNodeType @@ -37,6 +38,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight @@ -58,6 +60,79 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor +class CompressModule(nn.Module): + def __init__(self, level_low, level_high): + super().__init__() + self.level_low = level_low + self.level_high = level_high + + def forward(self, tensor, scale, zero_point=None): + # Compressed weights: (w / s) + optional zp + x = tensor / scale + if zero_point is not None: + x = x + zero_point + x = torch.round(x) + x = torch.clamp(x, min=self.level_low, max=self.level_high) + return x + + +class CompressDecompressModule(nn.Module): + def __init__(self, compress_mod): + super().__init__() + self.compress_mod = compress_mod + + def forward(self, tensor, scale, zero_point=None): + # Step 1: compress + clamp_out = self.compress_mod(tensor, scale, zero_point) + + # Step 2: decompress + if zero_point is not None: + out = (clamp_out - zero_point) * scale + else: + out = clamp_out * scale + return out + + +def get_compress_pipeline(mode: CompressWeightsMode, num_bits: int, use_torchscript=False): + asym_quant = mode in [CompressWeightsMode.INT4_ASYM] + level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) + level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 + + compress_module = CompressModule(level_low, level_high) + + # Optionally compile with TorchScript + if use_torchscript: + compress_module = torch.jit.script(compress_module) + + def _forward_fn(tensor, scale, zero_point): + with torch.no_grad(): + return compress_module(tensor, scale, zero_point) + + return _forward_fn + + +def get_compress_decompress_pipeline(mode, num_bits, use_torchscript=False): + compress_module = get_compress_pipeline( + mode=mode, + num_bits=num_bits, + use_torchscript=False, # We'll handle TorchScript in the final module + ) + + cdc_module = CompressDecompressModule(compress_module) + + # Optionally compile entire compress+decompress pipeline with TorchScript + if use_torchscript: + cdc_module = torch.jit.script(cdc_module) + + # Return a simple callable + def _forward_fn(parameters): + w, s, zp = parameters + with torch.no_grad(): + return cdc_module(w, s, zp) + + return _forward_fn + + class 
PTWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): TARGET_TYPE_TO_PT_INS_TYPE_MAP = { TargetType.PRE_LAYER_OPERATION: TargetType.OPERATOR_PRE_HOOK, @@ -210,6 +285,25 @@ def insert_adapters( ) -> None: pass + @staticmethod + def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): + return get_compress_decompress_pipeline(config.mode, config.num_bits, True) + + @staticmethod + def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): + return get_compress_pipeline(config.mode, config.num_bits, True) + + @staticmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + def filter_func(point: StatisticPoint) -> bool: + return ( + algorithm_key in point.algorithm_to_tensor_collectors + and point.target_point.type + == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] + ) + + return filter_func + def transform_model( self, model: NNCFNetwork, @@ -332,14 +426,3 @@ def mean_abs_max_statistic_collector( collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) return collector - - @staticmethod - def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: - def filter_func(point: StatisticPoint) -> bool: - return ( - algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type - == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] - ) - - return filter_func diff --git a/nncf/torch/engine.py b/nncf/torch/engine.py index ed70e8fb3a3..27b7cc5e706 100644 --- a/nncf/torch/engine.py +++ b/nncf/torch/engine.py @@ -44,9 +44,10 @@ def infer( :param input_data: Inputs for the model. :return: Model outputs. 
""" + with torch.no_grad(): + if isinstance(input_data, dict): + return self._model(**input_data) + if isinstance(input_data, tuple): + return self._model(*input_data) - if isinstance(input_data, dict): - return self._model(**input_data) - if isinstance(input_data, tuple): - return self._model(*input_data) - return self._model(input_data) + return self._model(input_data) diff --git a/nncf/torch/quantization/layers.py b/nncf/torch/quantization/layers.py index 13d9655cb81..9bd44b7fc22 100644 --- a/nncf/torch/quantization/layers.py +++ b/nncf/torch/quantization/layers.py @@ -1094,7 +1094,7 @@ def __init__(self, scale: torch.Tensor, zero_point: torch.Tensor, result_dtype: """ super().__init__() self.register_buffer("_scale", scale.type(dtype=torch.float16)) - self.register_buffer("_zero_point", self.pack_weight(zero_point)) + self.register_buffer("_zero_point", self.pack_weight(zero_point.type(dtype=torch.uint8))) self.result_dtype = result_dtype @property @@ -1165,7 +1165,7 @@ def __init__( self.register_buffer("_scale", scale.type(dtype=torch.float16)) self.zero_point_shape = zero_point.shape - self.register_buffer("_zero_point", self.pack_weight(zero_point)) + self.register_buffer("_zero_point", self.pack_weight(zero_point.type(dtype=torch.uint8))) self.compressed_weight_shape = compressed_weight_shape self.result_shape = result_shape From 035a6688de5b1b2678ff0083d5e39c0b44d57ad3 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 11:12:20 +0100 Subject: [PATCH 18/46] add functions --- .../weight_compression/torch_backend.py | 93 ++++++++----------- 1 file changed, 38 insertions(+), 55 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 09cd811f206..19fb1c2586c 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -12,7 +12,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Tuple import torch -import torch.nn as nn import nncf from nncf.common.graph.definitions import NNCFGraphNodeType @@ -60,75 +59,59 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -class CompressModule(nn.Module): - def __init__(self, level_low, level_high): - super().__init__() - self.level_low = level_low - self.level_high = level_high +def compress( + tensor: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None, level_low: int, level_high: int +) -> torch.Tensor: + x = tensor / scale + if zero_point is not None: + x = x + zero_point + x = torch.round(x) + x = torch.clamp(x, min=level_low, max=level_high) + return x - def forward(self, tensor, scale, zero_point=None): - # Compressed weights: (w / s) + optional zp - x = tensor / scale - if zero_point is not None: - x = x + zero_point - x = torch.round(x) - x = torch.clamp(x, min=self.level_low, max=self.level_high) - return x +def decompress(compressed: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None) -> torch.Tensor: + if zero_point is not None: + return (compressed - zero_point) * scale + return compressed * scale -class CompressDecompressModule(nn.Module): - def __init__(self, compress_mod): - super().__init__() - self.compress_mod = compress_mod - def forward(self, tensor, scale, zero_point=None): - # Step 1: compress - clamp_out = self.compress_mod(tensor, scale, zero_point) +def compress_decompress( + tensor: torch.Tensor, scale: torch.Tensor, zero_point: 
torch.Tensor | None, level_low: int, level_high: int +) -> torch.Tensor: + x = compress(tensor, scale, zero_point, level_low, level_high) + # Step 2: decompress + if zero_point is not None: + x = (x - zero_point) * scale + else: + x = x * scale + return x - # Step 2: decompress - if zero_point is not None: - out = (clamp_out - zero_point) * scale - else: - out = clamp_out * scale - return out - -def get_compress_pipeline(mode: CompressWeightsMode, num_bits: int, use_torchscript=False): +def get_compress_fn(mode, num_bits: int): asym_quant = mode in [CompressWeightsMode.INT4_ASYM] level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - compress_module = CompressModule(level_low, level_high) - - # Optionally compile with TorchScript - if use_torchscript: - compress_module = torch.jit.script(compress_module) - - def _forward_fn(tensor, scale, zero_point): + def _forward_fn(inputs): + tensor, scale, zero_point = inputs with torch.no_grad(): - return compress_module(tensor, scale, zero_point) + return compress(tensor, scale, zero_point, level_low, level_high) return _forward_fn -def get_compress_decompress_pipeline(mode, num_bits, use_torchscript=False): - compress_module = get_compress_pipeline( - mode=mode, - num_bits=num_bits, - use_torchscript=False, # We'll handle TorchScript in the final module - ) - - cdc_module = CompressDecompressModule(compress_module) - - # Optionally compile entire compress+decompress pipeline with TorchScript - if use_torchscript: - cdc_module = torch.jit.script(cdc_module) +def get_compress_decompress_fn(mode, num_bits: int): + asym_quant = mode in [CompressWeightsMode.INT4_ASYM] + level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) + level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - # Return a simple callable - def _forward_fn(parameters): - w, s, zp = parameters + def _forward_fn(inputs): + tensor, scale, zero_point = inputs with torch.no_grad(): - return cdc_module(w, s, zp) + return compress_decompress( + tensor=tensor, scale=scale, zero_point=zero_point, level_low=level_low, level_high=level_high + ) return _forward_fn @@ -287,11 +270,11 @@ def insert_adapters( @staticmethod def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - return get_compress_decompress_pipeline(config.mode, config.num_bits, True) + return get_compress_decompress_fn(config.mode, config.num_bits) @staticmethod def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): - return get_compress_pipeline(config.mode, config.num_bits, True) + return get_compress_fn(config.mode, config.num_bits) @staticmethod def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: From 58b992428bb319c747869a0f760c2f6aac36740c Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 11:15:26 +0100 Subject: [PATCH 19/46] upd metrics --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 3d27d81ee20..8d47e097953 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -36,7 +36,7 @@ tinyllama_scale_estimation_per_channel_backend_OV: num_int4: 188 num_int8: 124 tinyllama_scale_estimation_per_channel_backend_TORCH: - 
metric_value: 0.81389 + metric_value: 0.80799 num_int4: 188 num_int8: 124 tinyllama_data_aware_lora_stateful_backend_OV: From be3694beebd463e780675a34db2b557d56a431aa Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 14:16:51 +0100 Subject: [PATCH 20/46] rm ov flag --- .../algorithms/weight_compression/openvino_backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 53cf2c6a03f..fe22b5fdaec 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -374,7 +374,6 @@ def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p s = opset.parameter(s_shape, name="s") parameters = [w, s] compressed_w = w / s - compressed_w.get_rt_info()["nonconvertable_divide_0"] = True if z_p_shape is not None: zp = opset.parameter(z_p_shape, name="zp") parameters.append(zp) From 9345e2fe6277b4b6ffb3e05d856d98566d464b75 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 14:17:56 +0100 Subject: [PATCH 21/46] rm example --- .../llm_compression/torch/llama_3_2/main.py | 74 ------------------- .../torch/llama_3_2/requirements.txt | 4 - 2 files changed, 78 deletions(-) delete mode 100644 examples/llm_compression/torch/llama_3_2/main.py delete mode 100644 examples/llm_compression/torch/llama_3_2/requirements.txt diff --git a/examples/llm_compression/torch/llama_3_2/main.py b/examples/llm_compression/torch/llama_3_2/main.py deleted file mode 100644 index ff03296a47e..00000000000 --- a/examples/llm_compression/torch/llama_3_2/main.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import time - -import torch -from datasets import load_dataset -from optimum.exporters.openvino.convert import export_from_model -from optimum.intel.openvino import OVModelForCausalLM -from transformers import AutoModelForCausalLM -from transformers import AutoTokenizer - -import nncf - -MODEL_ID = "unsloth/Llama-3.2-1B" -OUTPUT_DIR = "compressed" -device = "cuda" if torch.cuda.is_available() else "cpu" - - -def quantize(model, dataset): - quantization_dataset = nncf.Dataset(dataset) - compressed_model = nncf.compress_weights( - model, - dataset=quantization_dataset, - mode=nncf.CompressWeightsMode.INT4_SYM, - ratio=0.8, - sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - ) - compressed_model.to("cpu") # issue with cuda export - export_from_model(compressed_model, OUTPUT_DIR, stateful=False, compression_option="fp32", device="cpu") - - -def validate(tokenizer, model): - input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device) - - start_t = time.time() - output = model.generate(**input_ids, max_new_tokens=100) - print("Elapsed time: ", time.time() - start_t) - - output_text = tokenizer.decode(output[0]) - print(output_text) - return output_text - - -def main(): - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - model = AutoModelForCausalLM.from_pretrained(MODEL_ID, load_in_8bit=False).to(device) - model.eval() - - dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - # dataset = dataset.filter(lambda example: len(example["text"]) > 128) # THIS LEADS TO A WORSE RESULT ON VALIDATION - - def transform_fn(data): - tokenized_text = tokenizer(data["text"], return_tensors="pt").to(device) - return tokenized_text.data # NEED TO RETURN ONE OF THE FORMATS of ENGINE (DICT) - - dataset = dataset.map(transform_fn).with_format("torch", device=device) - - quantize(model, dataset) - model = OVModelForCausalLM.from_pretrained( - OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"} - ) - validate(tokenizer, model) - - -if __name__ == "__main__": - main() diff --git a/examples/llm_compression/torch/llama_3_2/requirements.txt b/examples/llm_compression/torch/llama_3_2/requirements.txt deleted file mode 100644 index e29c588e595..00000000000 --- a/examples/llm_compression/torch/llama_3_2/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -transformers -datasets==3.2 -openvino==2024.6 -optimum-intel[openvino] From 6e7d9819537643b2ec621f5329e352807d4b3684 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 14:19:19 +0100 Subject: [PATCH 22/46] rm comments --- .../algorithms/weight_compression/scale_estimation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index b120092f6ca..e0cd43fa29c 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -237,7 +237,7 @@ def calculate_quantization_params( # normalize importances for every group of weights to make sum of them equal to 1.0 denum = fns.sum(importance, axis=2, keepdims=True) - importance = importance / (denum + eps) # for each weight in a group + importance = importance / (denum + eps) X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) @@ -278,7 +278,7 @@ def 
calculate_quantization_params(
         if zp is not None:
             input_tensors[2] = zp.data
         # iterative rectification of initial scale
-        for i in range(initial_steps):  # make several iterations of updating the scale
+        for i in range(initial_steps):
             near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
             near_to_ideal_scale = near_to_ideal_scale * scale_sign
             input_tensors[1] = near_to_ideal_scale.data
@@ -406,7 +406,6 @@ def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None
 
 
 def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor:
-    # ideal scale to determine the importance of the weights
     """
     Estimates scales for the given weight, target, zero mask, and importance.
 

From 683cfd490deab2226ad213a6dba536a6d80bcf9f Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Thu, 16 Jan 2025 10:01:57 +0100
Subject: [PATCH 23/46] fix tests

---
 .../weight_compression/scale_estimation.py | 4 ++--
 .../algorithms/weight_compression/torch_backend.py | 12 ++++++++--
 .../weight_compression/torch_fx_backend.py | 11 +++++++++
 tests/torch/fx/test_compress_weights.py | 9 ++++++++-
 4 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index e0cd43fa29c..6fae3ed905f 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -274,9 +274,9 @@ def calculate_quantization_params(
         zero_scale = 0.001
         zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
 
-        input_tensors = [original_weight.data, None, None]
+        input_tensors = [original_weight.data, None]
         if zp is not None:
-            input_tensors[2] = zp.data
+            input_tensors.append(zp.data)
         # iterative rectification of initial scale
         for i in range(initial_steps):
diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py
index 19fb1c2586c..649eb779775 100644
--- a/nncf/quantization/algorithms/weight_compression/torch_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py
@@ -94,7 +94,11 @@ def get_compress_fn(mode, num_bits: int):
     level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1
 
     def _forward_fn(inputs):
-        tensor, scale, zero_point = inputs
+        if len(inputs) == 3:
+            tensor, scale, zero_point = inputs
+        else:
+            tensor, scale = inputs
+            zero_point = None
         with torch.no_grad():
             return compress(tensor, scale, zero_point, level_low, level_high)
 
@@ -107,7 +111,11 @@ def get_compress_decompress_fn(mode, num_bits: int):
     level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1
 
     def _forward_fn(inputs):
-        tensor, scale, zero_point = inputs
+        if len(inputs) == 3:
+            tensor, scale, zero_point = inputs
+        else:
+            tensor, scale = inputs
+            zero_point = None
         with torch.no_grad():
             return compress_decompress(
                 tensor=tensor, scale=scale, zero_point=zero_point, level_low=level_low, level_high=level_high
diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py
index 449957d3ecb..8b14f441669 100644
--- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py
@@
-174,6 +174,17 @@ def insert_adapters( ) -> None: pass + @staticmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + def filter_func(point: StatisticPoint) -> bool: + return ( + algorithm_key in point.algorithm_to_tensor_collectors + and point.target_point.type + == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] + ) + + return filter_func + def transform_model( self, model: torch.fx.GraphModule, diff --git a/tests/torch/fx/test_compress_weights.py b/tests/torch/fx/test_compress_weights.py index 36b3e575db2..9d5bb0d880d 100644 --- a/tests/torch/fx/test_compress_weights.py +++ b/tests/torch/fx/test_compress_weights.py @@ -17,6 +17,7 @@ import nncf from nncf import BackupMode from nncf import CompressWeightsMode +from nncf import SensitivityMetric from nncf.common.factory import NNCFGraphFactory from nncf.data.dataset import Dataset from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node @@ -25,7 +26,6 @@ from nncf.torch.dynamic_graph.patch_pytorch import disable_patching from tests.torch.fx.helpers import get_torch_fx_model from tests.torch.ptq.test_weights_compression import ALL_SENSITIVITY_METRICS -from tests.torch.ptq.test_weights_compression import DATA_BASED_SENSITIVITY_METRICS from tests.torch.ptq.test_weights_compression import INT4_MODES from tests.torch.ptq.test_weights_compression import INT8_MODES from tests.torch.ptq.test_weights_compression import SUPPORTED_MODES @@ -37,6 +37,13 @@ from tests.torch.ptq.test_weights_compression import MatMulModel from tests.torch.test_models.synthetic import ShortTransformer +DATA_BASED_SENSITIVITY_METRICS = ( + SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + SensitivityMetric.MEAN_ACTIVATION_VARIANCE, + SensitivityMetric.MAX_ACTIVATION_VARIANCE, + SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, +) + def get_model_size(model): param_size = 0 From 1a33369df2b7e6d5dc1ee79fecbb027dc51464c2 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Thu, 16 Jan 2025 11:30:05 +0100 Subject: [PATCH 24/46] reimplement compress/decompress --- .../weight_compression/torch_backend.py | 62 +++++-------------- 1 file changed, 15 insertions(+), 47 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 649eb779775..a20d4934f0c 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -40,7 +40,9 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType from nncf.torch.dynamic_graph.scope import Scope @@ -59,67 +61,33 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -def compress( - tensor: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None, level_low: int, level_high: int -) -> 
torch.Tensor: - x = tensor / scale - if zero_point is not None: - x = x + zero_point - x = torch.round(x) - x = torch.clamp(x, min=level_low, max=level_high) - return x - - -def decompress(compressed: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None) -> torch.Tensor: - if zero_point is not None: - return (compressed - zero_point) * scale - return compressed * scale - - -def compress_decompress( - tensor: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None, level_low: int, level_high: int -) -> torch.Tensor: - x = compress(tensor, scale, zero_point, level_low, level_high) - # Step 2: decompress - if zero_point is not None: - x = (x - zero_point) * scale - else: - x = x * scale - return x - - -def get_compress_fn(mode, num_bits: int): - asym_quant = mode in [CompressWeightsMode.INT4_ASYM] - level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) - level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - +def get_compress_fn(config): def _forward_fn(inputs): if len(inputs) == 3: tensor, scale, zero_point = inputs + tensor, scale, zero_point = Tensor(tensor), Tensor(scale), Tensor(zero_point) else: tensor, scale = inputs + tensor, scale = Tensor(tensor), Tensor(scale) zero_point = None - with torch.no_grad(): - return compress(tensor, scale, zero_point, level_low, level_high) + quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) + return quantized.data return _forward_fn -def get_compress_decompress_fn(mode, num_bits: int): - asym_quant = mode in [CompressWeightsMode.INT4_ASYM] - level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) - level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - +def get_compress_decompress_fn(config): def _forward_fn(inputs): if len(inputs) == 3: tensor, scale, zero_point = inputs + tensor, scale, zero_point = Tensor(tensor), Tensor(scale), Tensor(zero_point) else: tensor, scale = inputs + tensor, scale = Tensor(tensor), Tensor(scale) zero_point = None - with torch.no_grad(): - return compress_decompress( - tensor=tensor, scale=scale, zero_point=zero_point, level_low=level_low, level_high=level_high - ) + quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) + dequantized = do_int_dequantization(quantized, scale=scale, zero_point=zero_point) + return dequantized.data return _forward_fn @@ -278,11 +246,11 @@ def insert_adapters( @staticmethod def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - return get_compress_decompress_fn(config.mode, config.num_bits) + return get_compress_decompress_fn(config) @staticmethod def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): - return get_compress_fn(config.mode, config.num_bits) + return get_compress_fn(config) @staticmethod def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: From dcf88a5e646fb0204b3130581ef8dc3911f25b7e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Thu, 16 Jan 2025 11:50:11 +0100 Subject: [PATCH 25/46] rm fx --- .../weight_compression/mixed_precision.py | 6 +-- .../weight_compression/scale_estimation.py | 6 +-- .../weight_compression/torch_fx_backend.py | 51 ------------------- 3 files changed, 2 insertions(+), 61 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/mixed_precision.py 
b/nncf/quantization/algorithms/weight_compression/mixed_precision.py index 6aa8a6b3c5e..2bab5e78f35 100644 --- a/nncf/quantization/algorithms/weight_compression/mixed_precision.py +++ b/nncf/quantization/algorithms/weight_compression/mixed_precision.py @@ -211,7 +211,7 @@ class DataBasedCriterion(DataFreeCriterion, ABC): @property def available_backends(self) -> List[BackendType]: - return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX] + return [BackendType.OPENVINO, BackendType.TORCH] def _set_backend_entity(self, model: TModel) -> None: model_backend = get_backend(model) @@ -223,10 +223,6 @@ def _set_backend_entity(self, model: TModel) -> None: from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend self._backend_entity = PTMixedPrecisionAlgoBackend() - elif model_backend == BackendType.TORCH_FX: - from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXMixedPrecisionAlgoBackend - - self._backend_entity = FXMixedPrecisionAlgoBackend() else: raise nncf.UnsupportedBackendError( "Cannot return backend-specific entity because {} is not supported!".format(model_backend.value) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 6fae3ed905f..40790f595b9 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -67,7 +67,7 @@ def __init__( @property def available_backends(self) -> List[BackendType]: - return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX] + return [BackendType.OPENVINO, BackendType.TORCH] def _set_backend_entity(self, model: TModel) -> None: """ @@ -84,10 +84,6 @@ def _set_backend_entity(self, model: TModel) -> None: from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend self._backend_entity = PTWeightCompressionAlgoBackend() - elif model_backend == BackendType.TORCH_FX: - from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend - - self._backend_entity = FXWeightCompressionAlgoBackend() else: raise nncf.UnsupportedBackendError( "Cannot return backend-specific Scale Estimation entity because {} is not supported!".format( diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 8b14f441669..8b57cf5f5c4 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -23,17 +23,10 @@ from nncf.common.graph.transformations.commands import TargetType from nncf.common.graph.transformations.layout import TransformationLayout from nncf.common.tensor_statistics.statistic_point import StatisticPoint -from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer -from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer -from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator from nncf.experimental.common.tensor_statistics.collectors import MeanReducer -from nncf.experimental.common.tensor_statistics.collectors import MeanVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator from nncf.experimental.common.tensor_statistics.collectors import ShapeReducer from 
nncf.experimental.common.tensor_statistics.collectors import TensorCollector -from nncf.experimental.common.tensor_statistics.statistics import MaxVarianceTensorStatistic -from nncf.experimental.common.tensor_statistics.statistics import MeanMagnitudeTensorStatistic -from nncf.experimental.common.tensor_statistics.statistics import MeanVarianceTensorStatistic from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.experimental.torch.fx.commands import FXApplyTransformationCommand from nncf.experimental.torch.fx.model_transformer import FXModelTransformer @@ -42,7 +35,6 @@ from nncf.experimental.torch.fx.transformations import constant_update_transformation_builder from nncf.experimental.torch.fx.transformations import module_insertion_transformation_builder from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -271,46 +263,3 @@ def transform_model( transformed_model = FXModelTransformer(model).transform(transformation_layout) return transformed_model - - -class FXMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, FXWeightCompressionAlgoBackend): - @staticmethod - def mean_variance_statistic_collector( - reduction_axes: Tuple[int], subset_size: Optional[int] = None - ) -> TensorCollector: - reducer = MeanVarianceReducer(reduction_axes) - aggregator = MeanAggregator(num_samples=subset_size) - collector = TensorCollector(MeanVarianceTensorStatistic) - collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator) - return collector - - @staticmethod - def max_variance_statistic_collector( - reduction_axes: Tuple[int], subset_size: Optional[int] = None - ) -> TensorCollector: - reducer = MaxVarianceReducer(reduction_axes) - aggregator = MeanAggregator(num_samples=subset_size) - collector = TensorCollector(MaxVarianceTensorStatistic) - collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) - return collector - - @staticmethod - def mean_abs_max_statistic_collector( - reduction_axes: Tuple[int], subset_size: Optional[int] = None - ) -> TensorCollector: - reducer = MeanAbsMaxReducer(reduction_axes) - aggregator = MeanAggregator(num_samples=subset_size) - collector = TensorCollector(MeanMagnitudeTensorStatistic) - collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) - return collector - - @staticmethod - def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: - def filter_func(point: StatisticPoint) -> bool: - return ( - algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type - == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] - ) - - return filter_func From e48a44b8296105dbb4ab47f5a843f94108b62bef Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 17 Jan 2025 16:22:29 +0100 Subject: [PATCH 26/46] add wc template --- nncf/torch/quantization/quantize_functions.py | 2 + .../template_test_weights_compression.py | 122 ++++++++++++++++++ tests/openvino/native/models.py | 4 +- 
.../quantization/test_weights_compression.py | 112 +++++----------- tests/torch/ptq/test_weights_compression.py | 61 +++++++++ 5 files changed, 218 insertions(+), 83 deletions(-) create mode 100644 tests/cross_fw/test_templates/template_test_weights_compression.py diff --git a/nncf/torch/quantization/quantize_functions.py b/nncf/torch/quantization/quantize_functions.py index 967a02dc256..debb4b5653b 100644 --- a/nncf/torch/quantization/quantize_functions.py +++ b/nncf/torch/quantization/quantize_functions.py @@ -292,6 +292,8 @@ def pack_uint4(tensor: torch.Tensor) -> torch.Tensor: if tensor.dtype != torch.uint8: raise ValidationError(f"Invalid tensor dtype {tensor.type}. torch.uint8 type is supported.") packed_tensor = tensor.contiguous() + # packed_tensor = packed_tensor.split(2, dim=-1) + # packed_tensor = packed_tensor packed_tensor = packed_tensor.reshape(-1, 2) packed_tensor = torch.bitwise_and(packed_tensor[..., ::2], 15) | packed_tensor[..., 1::2] << 4 return packed_tensor diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py new file mode 100644 index 00000000000..4d38e60090e --- /dev/null +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -0,0 +1,122 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
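+
+# A quick cross-check of the reference statistics defined below (illustrative
+# only, not used by the tests; plain NumPy over the shared ACTIVATION sample):
+#
+#   import numpy as np
+#   activation = np.array([[[-4, 1, 2], [0, 0, 0], [0, 0, 0]]], dtype=np.float32)
+#   np.max(np.var(activation, 1))           # -> 3.555555 (MAX_VAR)
+#   np.mean(np.var(activation, 1))          # -> 1.555555 (MEAN_VAR)
+#   np.mean(np.max(np.abs(activation), 1))  # -> 2.333333 (MEAN_MAX)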
+import math +from abc import ABC +from abc import abstractmethod +from typing import TypeVar + +import numpy as np +import pytest + +from nncf import CompressWeightsMode +from nncf import SensitivityMetric +from nncf.data.dataset import Dataset +from nncf.quantization import compress_weights +from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA +from nncf.tensor import TensorDataType + +TTensor = TypeVar("TTensor") + +NON_ZERO_ROW = [-4, 1, 2] +ACTIVATION = [[NON_ZERO_ROW, [0, 0, 0], [0, 0, 0]]] +MAX_VAR = 3.555555 # np.max(np.var(ACTIVATION, 1)) +MEAN_VAR = 1.555555 # np.mean(np.var(ACTIVATION, 1)) +MEAN_MAX = 2.333333 # np.mean(np.max(np.abs(ACTIVATION), 1)) +HESSIAN_TRACE = (16 + 1 + 4) * 2 / 9 # sum(i*i for i in NON_ZERO_ROW) * 2 / ACTIVATION.size +MAX_BASELINE_SCORE = 1 / 1.1920928955078125e-07 + + +class TemplateWeightCompression(ABC): + @staticmethod + @abstractmethod + def cast_to(x: TTensor, dtype: TensorDataType) -> TTensor: + pass + + @abstractmethod + def get_matmul_model(self): + """Returns a model instance.""" + + @pytest.mark.parametrize( + ("mode", "ref_act_score", "ref_score"), + ( + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, HESSIAN_TRACE, 0), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, MEAN_MAX, MEAN_MAX * MAX_BASELINE_SCORE), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, MEAN_VAR, MEAN_VAR * MAX_BASELINE_SCORE), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, MAX_VAR, MAX_VAR * MAX_BASELINE_SCORE), + ), + ) + def test_data_based_criterion(self, mode, ref_score, ref_act_score, mocker): + model = self.get_matmul_model() + data = self.cast_to(self.to_tensor(ACTIVATION), dtype=TensorDataType.float32) + dataset = Dataset([data]) + criterion_cls = MIXED_PRECISION_CRITERIA.get(mode) + scores_spy = mocker.spy(criterion_cls, "_calc_sensitivity") + act_scores_spy = mocker.spy(criterion_cls, "_calc_activation_sensitivity") + + compress_weights( + model, + mode=CompressWeightsMode.INT4_ASYM, + ratio=0.5, + group_size=1, + dataset=dataset, + sensitivity_metric=mode, + all_layers=True, + ) + scores = scores_spy.spy_return + act_scores = act_scores_spy.spy_return + assert math.isclose(scores[0], ref_score, rel_tol=1e-05, abs_tol=1e-08) + assert math.isclose(ref_act_score, act_scores, rel_tol=1e-05, abs_tol=1e-08) + + @abstractmethod + def get_sequential_matmul_model(self): ... + + @abstractmethod + def to_tensor(): ... + + @abstractmethod + def check_weights(self, model, ref_ids): ... 
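+
+    # `ref_ids` in the parametrization below lists the indices of the matmul
+    # layers expected to end up in INT4: layers are ranked by the chosen
+    # sensitivity metric and only the share of weights allowed by `ratio`
+    # stays in 4 bit; the rest falls back to the 8-bit backup precision
+    # (a qualitative summary, not an exact formula).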
+ + @pytest.mark.parametrize( + ("mode", "all_layers", "ratio", "ref_ids"), + ( + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2]), + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, True, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2]), + ), + ) + def test_mixed_precision(self, mode, all_layers, ratio, ref_ids): + model = self.get_sequential_matmul_model() + first = self.to_tensor(np.ones([1, 4, 4], dtype=np.float32)) + second = self.to_tensor(np.arange(16, dtype=np.float32)).reshape(1, 4, 4) + dataset = Dataset([first, second]) + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.INT4_SYM, + ratio=ratio, + group_size=1, + all_layers=all_layers, + sensitivity_metric=mode, + dataset=dataset, + ) + self.check_weights(compressed_model, ref_ids) diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index df83d366a9e..bbbde714d39 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -798,12 +798,12 @@ class SequentialMatmulModel(OVReferenceModel): """ def _create_ov_model(self): - input_node = opset.parameter([1, 3, 3], name="Input_1") + input_node = opset.parameter([1, 4, 4], name="Input_1") main_values = [10000, 1000, 1, 10, 10000] last_node = input_node for i, main_value in enumerate(main_values): - weights_data = np.arange(0, 9).reshape(3, 3) + weights_data = np.arange(0, 16).reshape(4, 4) weights_data[-1, -1] = main_value current_weights = opset.constant(weights_data, dtype=np.float32, name=f"weights_{i}") current_node = opset.matmul( diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index e39a621a4a8..a2b4fad8d03 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -47,6 +47,8 @@ from tests.cross_fw.shared.comparator import compare_stats from tests.cross_fw.shared.json import dump_to_json from tests.cross_fw.shared.json import load_json +from tests.cross_fw.test_templates.template_test_weights_compression import ACTIVATION +from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.openvino.native.common import get_actual_reference_for_current_openvino from tests.openvino.native.models import AWQActMatmulModel from tests.openvino.native.models import AWQMatmulModel @@ -263,46 +265,6 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): compare_stats(ref_stats, actual_stats) -@pytest.mark.parametrize( - 
("mode", "all_layers", "ratio", "ref_ids"), - ( - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), - (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2]), - (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MAX_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), - (SensitivityMetric.MAX_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, True, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2]), - ), -) -def test_mixed_precision(mode, all_layers, ratio, ref_ids, mocker): - model = SequentialMatmulModel().ov_model - dataset = Dataset([np.ones([1, 3, 3]), np.arange(9).reshape(1, 3, 3)]) - compressed_model = compress_weights( - model, - mode=CompressWeightsMode.NF4, - ratio=ratio, - group_size=1, - all_layers=all_layers, - sensitivity_metric=mode, - dataset=dataset, - ) - names = { - op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == ov.Type.nf4 - } - ref_nf4_nodes = {f"weights_{i}" for i in ref_ids} - assert ref_nf4_nodes == names - - @pytest.mark.parametrize("metric", DATA_BASED_SENSITIVITY_METRICS) def test_gather_in_4_bit_if_all_layers_with_data(metric): dim1 = 2 # sequence length dimension @@ -430,46 +392,6 @@ def test_gather_in_8_bit_if_not_all_layers(metric): assert node.get_element_type() == ov.Type.u8 -MAX_BASELINE_SCORE = 1 / np.finfo(np.float32).eps -NON_ZERO_ROW = [-4, 1, 2] -ACTIVATION = np.array([[NON_ZERO_ROW, [0, 0, 0], [0, 0, 0]]]) -MAX_VAR = 3.555555 # np.max(np.var(ACTIVATION, 1)) -MEAN_VAR = 1.555555 # np.mean(np.var(ACTIVATION, 1)) -MEAN_MAX = 2.333333 # np.mean(np.max(np.abs(ACTIVATION), 1)) -HESSIAN_TRACE = (16 + 1 + 4) * 2 / 9 # sum(i*i for i in NON_ZERO_ROW) * 2 / ACTIVATION.size - - -@pytest.mark.parametrize( - ("mode", "ref_act_scores", "ref_scores"), - ( - (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, HESSIAN_TRACE, 0), - (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, MEAN_MAX, MEAN_MAX * MAX_BASELINE_SCORE), - (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, MEAN_VAR, MEAN_VAR * MAX_BASELINE_SCORE), - (SensitivityMetric.MAX_ACTIVATION_VARIANCE, MAX_VAR, MAX_VAR * MAX_BASELINE_SCORE), - ), -) -def test_data_based_criterion(mode, ref_scores, ref_act_scores, mocker): - model = IdentityMatmul().ov_model - dataset = Dataset([ACTIVATION]) - criterion_cls = MIXED_PRECISION_CRITERIA.get(mode) - scores_spy = mocker.spy(criterion_cls, "_calc_sensitivity") - act_scores_spy = mocker.spy(criterion_cls, "_calc_activation_sensitivity") - - compress_weights( - model, - mode=CompressWeightsMode.NF4, - ratio=0.5, - group_size=1, - dataset=dataset, - sensitivity_metric=mode, - all_layers=True, - ) - scores = scores_spy.spy_return - act_scores = act_scores_spy.spy_return - assert np.allclose(scores, ref_scores) - assert np.allclose(act_scores, ref_act_scores) - - @pytest.mark.parametrize("mode", 
(CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM)) def test_quantize_Gather_with_multiple_reduction_axes_in_8bit(mode): model = GatherWithTwoReductionAxes().ov_model @@ -1055,7 +977,7 @@ def test_call_gptq(mode): ) def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): model = SequentialMatmulModel().ov_model - dataset = Dataset([np.ones([1, 3, 3]), np.arange(9).reshape(3, 3)]) + dataset = Dataset([np.ones([1, 4, 4]), np.arange(16).reshape(4, 4)]) compressed_model = compress_weights( model, mode=CompressWeightsMode.E2M1, @@ -1596,3 +1518,31 @@ def test_compression_with_transposed_activations(kwargs): all_layers=True, **kwargs, ) + + +class TestOVTemplateWeightCompression(TemplateWeightCompression): + @staticmethod + def get_matmul_model(): + return IdentityMatmul().ov_model + + @staticmethod + def get_sequential_matmul_model(): + return SequentialMatmulModel().ov_model + + @staticmethod + def to_tensor(x): + return np.array(x) + + @staticmethod + def cast_to(x: np.ndarray, dtype: TensorDataType) -> np.ndarray: + if dtype is TensorDataType.float32: + return x.astype(np.float32) + if dtype is TensorDataType.float16: + return x.astype(np.float16) + raise NotImplementedError + + @staticmethod + def check_weights(model, ref_ids): + names = {op.get_friendly_name() for op in model.get_ordered_ops() if op.get_element_type() == ov.Type.i4} + ref_nf4_nodes = {f"weights_{i}" for i in ref_ids} + assert ref_nf4_nodes == names diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 979326cfb36..d9c0dc2b2f2 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -11,6 +11,7 @@ import pytest import torch +import torch.nn as nn import torch.nn.functional as F import nncf @@ -19,6 +20,7 @@ from nncf import SensitivityMetric from nncf.quantization import compress_weights from nncf.quantization.advanced_parameters import AdvancedCompressionParameters +from nncf.tensor import TensorDataType from nncf.torch import wrap_model from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor from nncf.torch.quantization.layers import INT4SymmetricWeightsDecompressor @@ -28,7 +30,9 @@ from nncf.torch.quantization.quantize_functions import pack_uint4 from nncf.torch.quantization.quantize_functions import unpack_int4 from nncf.torch.quantization.quantize_functions import unpack_uint4 +from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.torch.test_models.synthetic import ShortTransformer +from tests.torch.test_tensor import cast_to ALL_SENSITIVITY_METRICS = list(SensitivityMetric) @@ -318,3 +322,60 @@ def test_pack_int4(): assert packed_w.numel() * 2 == w_int8.numel() unpacked_w = unpack_int4(packed_w).reshape(w_int8.shape) assert torch.all(unpacked_w == w_int8) + + +class IdentityMatmul(torch.nn.Module): + def __init__(self): + super().__init__() + self.w = torch.nn.Parameter( + torch.eye(3, dtype=torch.float32) * 255, + ) + + def forward(self, input): + return input @ self.w + + +class SequentialMatmulModel(nn.Module): + def __init__(self): + super(SequentialMatmulModel, self).__init__() + self.main_values = [10000, 1000, 1, 10, 10000] + self.layers = nn.ModuleList() + + for _, main_value in enumerate(self.main_values): + weights_data = torch.arange(0, 16, dtype=torch.float32).reshape(4, 4) + weights_data[-1, -1] = main_value + weight_tensor = torch.tensor(weights_data) + layer = nn.Linear(4, 4, bias=False) + 
layer.weight = nn.Parameter(weight_tensor.t()) + self.layers.append(layer) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class TestPTTemplateWeightCompression(TemplateWeightCompression): + @staticmethod + def get_matmul_model(): + return IdentityMatmul() + + @staticmethod + def get_sequential_matmul_model(): + return SequentialMatmulModel() + + @staticmethod + def to_tensor(t): + return torch.tensor(t) + + @staticmethod + def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: + return cast_to(x, dtype) + + @staticmethod + def check_weights(model, ref_ids): + for i, op in enumerate(model.layers): + if i in ref_ids: + assert torch.numel(op.weight) == 8 # workaround to detect uint4 weights + else: + assert torch.numel(op.weight) == 16 From 63e8c0ad07ff128363f2b3d29fec37831a99ab4a Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 17 Jan 2025 16:31:01 +0100 Subject: [PATCH 27/46] polishing --- .../template_test_weights_compression.py | 20 +++++++++++++------ .../quantization/test_weights_compression.py | 12 +++++------ tests/torch/ptq/test_weights_compression.py | 10 ++++++---- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 4d38e60090e..a71d0f7d24a 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -11,7 +11,7 @@ import math from abc import ABC from abc import abstractmethod -from typing import TypeVar +from typing import List, TypeVar import numpy as np import pytest @@ -23,6 +23,7 @@ from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.tensor import TensorDataType +TModel = TypeVar("TModel") TTensor = TypeVar("TTensor") NON_ZERO_ROW = [-4, 1, 2] @@ -41,8 +42,10 @@ def cast_to(x: TTensor, dtype: TensorDataType) -> TTensor: pass @abstractmethod - def get_matmul_model(self): - """Returns a model instance.""" + def get_matmul_model() -> TModel: + """ + Returns a backend model for test_data_based_criterion. + """ @pytest.mark.parametrize( ("mode", "ref_act_score", "ref_score"), @@ -76,13 +79,18 @@ def test_data_based_criterion(self, mode, ref_score, ref_act_score, mocker): assert math.isclose(ref_act_score, act_scores, rel_tol=1e-05, abs_tol=1e-08) @abstractmethod - def get_sequential_matmul_model(self): ... + def get_sequential_matmul_model() -> TModel: + """ + Returns a backend model for test_mixed_precision. + """ @abstractmethod - def to_tensor(): ... + def to_tensor(x: TTensor) -> TTensor: + pass @abstractmethod - def check_weights(self, model, ref_ids): ... 
+ def check_weights(model: TModel, ref_ids: List[int]) -> None: + """Checks that only weights with specified ids are compressed in int4 format.""" @pytest.mark.parametrize( ("mode", "all_layers", "ratio", "ref_ids"), diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index a2b4fad8d03..aedfc1d0573 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -874,7 +874,7 @@ def test_number_of_reduced_statistics_for_subset_size( mocker, dataset_size, subset_size, ref_size, compression_args, multiplier_of_calls ): model = IdentityMatmul().ov_model - dataset = Dataset([ACTIVATION] * dataset_size) + dataset = Dataset([np.array(ACTIVATION)] * dataset_size) stats_spy = mocker.spy(AggregatorBase, "register_reduced_input") compress_weights(model, dataset=dataset, subset_size=subset_size, **compression_args) @@ -890,7 +890,7 @@ def test_default_subset_value(): @pytest.mark.parametrize("subset_size", (-1, 0)) def test_invalid_subset_size(subset_size): model = IdentityMatmul().ov_model - dataset = Dataset([ACTIVATION]) + dataset = Dataset([np.array(ACTIVATION)]) with pytest.raises(nncf.ValidationError): compress_weights(model, mode=CompressWeightsMode.INT4_ASYM, ratio=0.5, dataset=dataset, subset_size=subset_size) @@ -1522,15 +1522,15 @@ def test_compression_with_transposed_activations(kwargs): class TestOVTemplateWeightCompression(TemplateWeightCompression): @staticmethod - def get_matmul_model(): + def get_matmul_model() -> ov.Model: return IdentityMatmul().ov_model @staticmethod - def get_sequential_matmul_model(): + def get_sequential_matmul_model() -> ov.Model: return SequentialMatmulModel().ov_model @staticmethod - def to_tensor(x): + def to_tensor(x) -> np.ndarray: return np.array(x) @staticmethod @@ -1542,7 +1542,7 @@ def cast_to(x: np.ndarray, dtype: TensorDataType) -> np.ndarray: raise NotImplementedError @staticmethod - def check_weights(model, ref_ids): + def check_weights(model: ov.Model, ref_ids: List[int]) -> None: names = {op.get_friendly_name() for op in model.get_ordered_ops() if op.get_element_type() == ov.Type.i4} ref_nf4_nodes = {f"weights_{i}" for i in ref_ids} assert ref_nf4_nodes == names diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index d9c0dc2b2f2..06047849ba1 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -9,6 +9,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
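+# Packing note for the 4-bit checks in this module: pack_uint4/pack_int4 store
+# two 4-bit values per uint8, low nibble first, i.e. roughly:
+#
+#   a, b = 1, 2                    # two uint4 values
+#   packed = (a & 15) | (b << 4)   # -> 33 (0x21): one byte holds both values
+#
+# which is why a compressed 4x4 weight reports numel == 8 instead of 16.
+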
+from typing import List + import pytest import torch import torch.nn as nn @@ -357,15 +359,15 @@ def forward(self, x): class TestPTTemplateWeightCompression(TemplateWeightCompression): @staticmethod - def get_matmul_model(): + def get_matmul_model() -> torch.nn.Module: return IdentityMatmul() @staticmethod - def get_sequential_matmul_model(): + def get_sequential_matmul_model() -> torch.nn.Module: return SequentialMatmulModel() @staticmethod - def to_tensor(t): + def to_tensor(t) -> torch.Tensor: return torch.tensor(t) @staticmethod @@ -373,7 +375,7 @@ def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: return cast_to(x, dtype) @staticmethod - def check_weights(model, ref_ids): + def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None: for i, op in enumerate(model.layers): if i in ref_ids: assert torch.numel(op.weight) == 8 # workaround to detect uint4 weights From b2fef75fa5e071ef7a931da51a1839880bd0a631 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 17 Jan 2025 16:39:29 +0100 Subject: [PATCH 28/46] comment --- .../test_reducers_and_aggregators.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index 7d60f0fc01a..fd1b959732d 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -55,6 +55,7 @@ NO_OUTLIERS_DEFAULT_3D_MEDIAN_VALUE = [[5.0, 4.0, 15.0], [8.0, 25.0, 12.0], [35.0, 16.0, 45.0]] +WEIGHT_COMPRESSION_REDUCERS_DATA = [[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]] default_test_quantile = 0.1 @@ -238,10 +239,10 @@ def test_quantile_reducers(self, reducer_name, ref, reducers): @pytest.mark.parametrize( "axes, np_data, reference", [ - [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], - [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 14.25], - [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 15.875], - [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], + [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 14.25], + [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 15.875], + [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], ], ) def test_mean_variance_reducer(self, axes, np_data, reference): @@ -254,10 +255,10 @@ def test_mean_variance_reducer(self, axes, np_data, reference): @pytest.mark.parametrize( "axes, np_data, reference", [ - [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 10.0], - [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 4.16666], - [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 6.33333], - [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 10.0], + [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 10.0], + [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 4.16666], + [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 6.33333], + [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 10.0], ], ) def test_mean_abs_max_reducer(self, axes, np_data, reference): @@ -270,10 +271,10 @@ def test_mean_abs_max_reducer(self, axes, np_data, reference): @pytest.mark.parametrize( "axes, np_data, reference", [ - [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], - 
[(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 64.0], - [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 36.1875], - [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], + [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 64.0], + [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 36.1875], + [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], ], ) def test_max_variance_reducer(self, axes, np_data, reference): @@ -582,8 +583,7 @@ def test_reducers_name_hash_equal(self, reducer_name, reducers): params["channel_axis"] = [1, 2] else: raise nncf.ValidationError( - "test_min_max_mean_reducer_hash_equal configurated in a wrong way." - f" Wrong reducer_name: {reducer_name}" + f"test_min_max_mean_reducer_hash_equal configurated in a wrong way. Wrong reducer_name: {reducer_name}" ) def product_dict(**kwargs): From 32788a4f46b154104dbc0f07457ace7f70d6abd9 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 17 Jan 2025 17:18:29 +0100 Subject: [PATCH 29/46] comments --- nncf/torch/quantization/quantize_functions.py | 2 - .../test_reducers_and_aggregators.py | 39 ++++++------------- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/nncf/torch/quantization/quantize_functions.py b/nncf/torch/quantization/quantize_functions.py index debb4b5653b..967a02dc256 100644 --- a/nncf/torch/quantization/quantize_functions.py +++ b/nncf/torch/quantization/quantize_functions.py @@ -292,8 +292,6 @@ def pack_uint4(tensor: torch.Tensor) -> torch.Tensor: if tensor.dtype != torch.uint8: raise ValidationError(f"Invalid tensor dtype {tensor.type}. torch.uint8 type is supported.") packed_tensor = tensor.contiguous() - # packed_tensor = packed_tensor.split(2, dim=-1) - # packed_tensor = packed_tensor packed_tensor = packed_tensor.reshape(-1, 2) packed_tensor = torch.bitwise_and(packed_tensor[..., ::2], 15) | packed_tensor[..., 1::2] << 4 return packed_tensor diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index fd1b959732d..334640f996e 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -237,49 +237,34 @@ def test_quantile_reducers(self, reducer_name, ref, reducers): assert fns.allclose(val[i], self.get_nncf_tensor(ref_)) @pytest.mark.parametrize( - "axes, np_data, reference", - [ - [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], - [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 14.25], - [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 15.875], - [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], - ], + "axes, reference", + [[None, 16.1666], [(0,), 14.25], [(0, 1), 15.875], [(0, 1, 2), 16.1666]], ) - def test_mean_variance_reducer(self, axes, np_data, reference): + def test_mean_variance_reducer(self, axes, reference): reducer = MeanVarianceReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) + nncf_data = self.get_nncf_tensor(np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) @pytest.mark.parametrize( - "axes, np_data, reference", - [ - [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 10.0], - [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 
4.16666], - [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 6.33333], - [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 10.0], - ], + "axes, reference", + [[None, 10.0], [(0,), 4.16666], [(0, 1), 6.33333], [(0, 1, 2), 10.0]], ) - def test_mean_abs_max_reducer(self, axes, np_data, reference): + def test_mean_abs_max_reducer(self, axes, reference): reducer = MeanAbsMaxReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) + nncf_data = self.get_nncf_tensor(np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) @pytest.mark.parametrize( - "axes, np_data, reference", - [ - [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], - [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 64.0], - [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 36.1875], - [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], - ], + "axes, reference", + [[None, 16.1666], [(0,), 64.0], [(0, 1), 36.1875], [(0, 1, 2), 16.1666]], ) - def test_max_variance_reducer(self, axes, np_data, reference): + def test_max_variance_reducer(self, axes, reference): reducer = MaxVarianceReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) + nncf_data = self.get_nncf_tensor(np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) From 9d0acdbd2685c88d3f1caf2804e2d57e9762857e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Mon, 20 Jan 2025 11:26:42 +0100 Subject: [PATCH 30/46] rollback no_grad --- .../weight_compression/torch_backend.py | 33 +++++++++---------- nncf/torch/engine.py | 11 +++---- .../post_training/data/wc_reference_data.yaml | 2 +- 3 files changed, 21 insertions(+), 25 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index a20d4934f0c..518be41721d 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -61,30 +61,27 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -def get_compress_fn(config): - def _forward_fn(inputs): - if len(inputs) == 3: - tensor, scale, zero_point = inputs - tensor, scale, zero_point = Tensor(tensor), Tensor(scale), Tensor(zero_point) - else: - tensor, scale = inputs - tensor, scale = Tensor(tensor), Tensor(scale) - zero_point = None +def _prepare_inputs( + tensor: torch.Tensor, scale: torch.Tensor, zero_point=Optional[torch.Tensor] +) -> Tuple[Tensor, Tensor, Optional[Tensor]]: + tensor, scale = Tensor(tensor), Tensor(scale) + if zero_point is not None: + zero_point = Tensor(zero_point) + return tensor, scale, zero_point + + +def get_compress_fn(config: WeightCompressionConfig) -> Callable[[Tuple], Tensor]: + def _forward_fn(inputs: Tuple) -> Tensor: + tensor, scale, zero_point = _prepare_inputs(*inputs) quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) return quantized.data return _forward_fn -def get_compress_decompress_fn(config): - def _forward_fn(inputs): - if len(inputs) == 3: - tensor, scale, zero_point = inputs - tensor, scale, zero_point = Tensor(tensor), Tensor(scale), Tensor(zero_point) - else: - tensor, scale = 
inputs - tensor, scale = Tensor(tensor), Tensor(scale) - zero_point = None +def get_compress_decompress_fn(config: WeightCompressionConfig) -> Callable[[Tuple], Tensor]: + def _forward_fn(inputs: Tuple) -> Tensor: + tensor, scale, zero_point = _prepare_inputs(*inputs) quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) dequantized = do_int_dequantization(quantized, scale=scale, zero_point=zero_point) return dequantized.data diff --git a/nncf/torch/engine.py b/nncf/torch/engine.py index 27b7cc5e706..239c6857c24 100644 --- a/nncf/torch/engine.py +++ b/nncf/torch/engine.py @@ -44,10 +44,9 @@ def infer( :param input_data: Inputs for the model. :return: Model outputs. """ - with torch.no_grad(): - if isinstance(input_data, dict): - return self._model(**input_data) - if isinstance(input_data, tuple): - return self._model(*input_data) + if isinstance(input_data, dict): + return self._model(**input_data) + if isinstance(input_data, tuple): + return self._model(*input_data) - return self._model(input_data) + return self._model(input_data) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 8d47e097953..3d27d81ee20 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -36,7 +36,7 @@ tinyllama_scale_estimation_per_channel_backend_OV: num_int4: 188 num_int8: 124 tinyllama_scale_estimation_per_channel_backend_TORCH: - metric_value: 0.80799 + metric_value: 0.81389 num_int4: 188 num_int8: 124 tinyllama_data_aware_lora_stateful_backend_OV: From 37a41ac6feb4ffb768933214e1312e2ac624570e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Mon, 20 Jan 2025 12:59:29 +0100 Subject: [PATCH 31/46] add torch.no_grad() --- .../torch/fx/quantization/quantize_model.py | 40 +++++++++---------- nncf/torch/quantization/quantize_model.py | 36 ++++++++--------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/nncf/experimental/torch/fx/quantization/quantize_model.py b/nncf/experimental/torch/fx/quantization/quantize_model.py index 3d5c64dfccd..ef2aac355ca 100644 --- a/nncf/experimental/torch/fx/quantization/quantize_model.py +++ b/nncf/experimental/torch/fx/quantization/quantize_model.py @@ -135,25 +135,25 @@ def compress_weights_impl( """ Implementation of the `compress_weights()` method for the Torch Fx backend. 
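    The whole flow runs in a no-grad context, since weight compression only
    reads the model weights and needs no autograd state.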
""" - - compression_algorithm = WeightCompression( - mode, - ratio, - group_size, - ignored_scope, - all_layers, - sensitivity_metric, - awq, - subset_size, - scale_estimation, - gptq, - lora_correction, - backup_mode, - advanced_parameters, - ) - graph = NNCFGraphFactory.create(model) - compressed_model = compression_algorithm.apply(model, graph, dataset=dataset) - compressed_model = GraphModule(compressed_model, compressed_model.graph) - compressed_model = _disallow_eval_train(compressed_model) + with torch.no_grad(): + compression_algorithm = WeightCompression( + mode, + ratio, + group_size, + ignored_scope, + all_layers, + sensitivity_metric, + awq, + subset_size, + scale_estimation, + gptq, + lora_correction, + backup_mode, + advanced_parameters, + ) + graph = NNCFGraphFactory.create(model) + compressed_model = compression_algorithm.apply(model, graph, dataset=dataset) + compressed_model = GraphModule(compressed_model, compressed_model.graph) + compressed_model = _disallow_eval_train(compressed_model) return compressed_model diff --git a/nncf/torch/quantization/quantize_model.py b/nncf/torch/quantization/quantize_model.py index 3e5c9af0ce4..ddc97c13d90 100644 --- a/nncf/torch/quantization/quantize_model.py +++ b/nncf/torch/quantization/quantize_model.py @@ -101,21 +101,21 @@ def compress_weights_impl( """ Implementation of the `compress_weights()` method for the PyTorch backend. """ - - compression_algorithm = WeightCompression( - mode, - ratio, - group_size, - ignored_scope, - all_layers, - sensitivity_metric, - awq, - subset_size, - scale_estimation, - gptq, - lora_correction, - backup_mode, - advanced_parameters, - ) - graph = NNCFGraphFactory.create(model) - return compression_algorithm.apply(model, graph, dataset=dataset) + with torch.no_grad(): + compression_algorithm = WeightCompression( + mode, + ratio, + group_size, + ignored_scope, + all_layers, + sensitivity_metric, + awq, + subset_size, + scale_estimation, + gptq, + lora_correction, + backup_mode, + advanced_parameters, + ) + graph = NNCFGraphFactory.create(model) + return compression_algorithm.apply(model, graph, dataset=dataset) From a305fac031370bab353dd7a70f61dce33827135c Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Mon, 20 Jan 2025 14:34:29 +0100 Subject: [PATCH 32/46] start of cuda in conformance --- .../pipelines/lm_weight_compression.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index b72e9fb632e..b9ce4eca3e2 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -85,7 +85,9 @@ def prepare_model(self) -> None: raise RuntimeError(f"is_stateful={is_stateful} is not supported for PyTorch backend.") self.model_hf = AutoModelForCausalLM.from_pretrained( - self.model_id, torch_dtype=torch.float32, device_map="cpu" + self.model_id, + torch_dtype=torch.float32, + device_map="cpu", # TODO (kshpv): add support of 'cuda', when supported ) self.model = self.model_hf elif self.backend == BackendType.OV: @@ -157,7 +159,7 @@ def transform_fn(data, max_tokens=128, filter_bad_tokens=True): inputs[name] = np.zeros(shape) if self.backend == BackendType.TORCH: for input_name in inputs: - inputs[input_name] = torch.from_numpy(inputs[input_name]) + inputs[input_name] = torch.from_numpy(inputs[input_name]).to(self.model_hf.device) return inputs return transform_fn @@ -209,7 +211,13 @@ def 
save_compressed_model(self) -> None: ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) self.model_hf._save_config(self.output_model_dir) elif self.backend == BackendType.TORCH: - export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32") + export_from_model( + self.model_hf, + self.output_model_dir, + stateful=False, + compression_option="fp32", + device=self.model_hf.device, + ) def get_num_compressed(self) -> None: """ From ddee49532fdf5705fe5d1e6416621c7d8b2d375f Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 11:53:40 +0100 Subject: [PATCH 33/46] add scale estimation test --- .../template_test_weights_compression.py | 51 ++++++-- tests/openvino/native/models.py | 15 +++ .../quantization/test_weights_compression.py | 44 +++++++ .../fx/test_weights_compression_backends.py | 38 ++++++ tests/torch/ptq/test_weights_compression.py | 110 ++++++++++++------ 5 files changed, 216 insertions(+), 42 deletions(-) create mode 100644 tests/torch/fx/test_weights_compression_backends.py diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index a71d0f7d24a..d5bdf386300 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -16,11 +16,14 @@ import numpy as np import pytest +import nncf.tensor.functions as fns from nncf import CompressWeightsMode from nncf import SensitivityMetric from nncf.data.dataset import Dataset from nncf.quantization import compress_weights from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA +from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation +from nncf.tensor import Tensor from nncf.tensor import TensorDataType TModel = TypeVar("TModel") @@ -39,13 +42,11 @@ class TemplateWeightCompression(ABC): @staticmethod @abstractmethod def cast_to(x: TTensor, dtype: TensorDataType) -> TTensor: - pass + """Casts a backend tensor to backend tensor with specified dtype.""" @abstractmethod def get_matmul_model() -> TModel: - """ - Returns a backend model for test_data_based_criterion. - """ + """Returns a backend model for test_data_based_criterion.""" @pytest.mark.parametrize( ("mode", "ref_act_score", "ref_score"), @@ -80,13 +81,11 @@ def test_data_based_criterion(self, mode, ref_score, ref_act_score, mocker): @abstractmethod def get_sequential_matmul_model() -> TModel: - """ - Returns a backend model for test_mixed_precision. - """ + """Returns a backend model for test_mixed_precision.""" @abstractmethod def to_tensor(x: TTensor) -> TTensor: - pass + """Returns a backend tensor.""" @abstractmethod def check_weights(model: TModel, ref_ids: List[int]) -> None: @@ -128,3 +127,39 @@ def test_mixed_precision(self, mode, all_layers, ratio, ref_ids): dataset=dataset, ) self.check_weights(compressed_model, ref_ids) + + @staticmethod + @abstractmethod + def get_model_for_test_scale_estimation(): + """ + Returns a backend model for test_scale_estimation. + """ + + @staticmethod + @abstractmethod + def get_scale_estimation_ref(): + """ + Returns the reference output of calculate_quantization_params of ScaleEstimation. 
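+        The expected tensor holds one scale per (output channel, weight group)
+        pair of the test model's matmul.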
+ """ + + def test_scale_estimation(self, mocker): + calc_q_params_spy = mocker.spy(ScaleEstimation, "calculate_quantization_params") + model = self.get_model_for_test_scale_estimation() + + # prepare dataset with one input tensor + input = np.arange(0, 32 * 32, dtype=np.float32).reshape(1, 32, 32) + input[0, 15] *= 100 # make one channel relatively higher. + input = self.to_tensor(input) + dataset = Dataset([input]) + + _ = compress_weights( + model, + mode=CompressWeightsMode.INT4_ASYM, + ratio=1.0, + group_size=32, + scale_estimation=True, + all_layers=True, + dataset=dataset, + ) + reference = self.get_scale_estimation_ref() + assert fns.allclose(Tensor(reference), calc_q_params_spy.spy_return[0]) diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index bbbde714d39..56ef047f97f 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1185,3 +1185,18 @@ def _create_ov_model(self): model = ov.Model([sin_result, cos_result], [position_ids]) return model + + +class MLP(OVReferenceModel): + def _create_ov_model(self): + input_node = opset.parameter([1, 32, 32], name="Input") + + weights_data = np.arange(0, 32 * 32, dtype=np.float32).reshape(32, 32) + weights_node = opset.constant(weights_data, dtype=np.float32, name="Weights") + + matmul_node = opset.matmul(input_node, weights_node, transpose_a=False, transpose_b=True, name="MatMul") + + result_node = opset.result(matmul_node, name="Result") + + model = ov.Model([result_node], [input_node], name="MLP_Model") + return model diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index aedfc1d0573..e0d072cb04a 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -50,6 +50,7 @@ from tests.cross_fw.test_templates.template_test_weights_compression import ACTIVATION from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.openvino.native.common import get_actual_reference_for_current_openvino +from tests.openvino.native.models import MLP from tests.openvino.native.models import AWQActMatmulModel from tests.openvino.native.models import AWQMatmulModel from tests.openvino.native.models import GatherAndMatmulShareData @@ -1546,3 +1547,46 @@ def check_weights(model: ov.Model, ref_ids: List[int]) -> None: names = {op.get_friendly_name() for op in model.get_ordered_ops() if op.get_element_type() == ov.Type.i4} ref_nf4_nodes = {f"weights_{i}" for i in ref_ids} assert ref_nf4_nodes == names + + @staticmethod + def get_model_for_test_scale_estimation(): + return MLP().ov_model + + @staticmethod + def get_scale_estimation_ref(): + return np.array( + [ + [[2.0666666]], + [[3.7624273]], + [[5.884783]], + [[8.03606]], + [[10.136832]], + [[12.291862]], + [[14.34415]], + [[16.449669]], + [[18.608639]], + [[20.802698]], + [[22.9477]], + [[25.083504]], + [[27.152409]], + [[29.141987]], + [[31.171442]], + [[33.044716]], + [[35.178047]], + [[37.31138]], + [[39.444714]], + [[41.578045]], + [[43.71138]], + [[45.844715]], + [[47.978046]], + [[50.11138]], + [[52.244713]], + [[54.378044]], + [[56.511383]], + [[58.644714]], + [[60.77805]], + [[62.91138]], + [[65.044716]], + [[67.17805]], + ] + ) diff --git a/tests/torch/fx/test_weights_compression_backends.py b/tests/torch/fx/test_weights_compression_backends.py new file mode 100644 index 00000000000..bd66093ec19 --- /dev/null +++ 
b/tests/torch/fx/test_weights_compression_backends.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from nncf.quantization.algorithms.weight_compression.mixed_precision import HAWQCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MaxVarianceCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MeanMaxCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MeanVarianceCriterion +from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend +from tests.cross_fw.test_templates.test_weights_compression_backends import TemplateTestMixedPrecisionAlgoBackend + + +class TestPTMixedPrecisionAlgoBackend(TemplateTestMixedPrecisionAlgoBackend): + def get_hawq_with_backend(self, subset_size): + hawq = HAWQCriterion(None, None, subset_size=subset_size) + hawq._backend_entity = PTMixedPrecisionAlgoBackend() + return hawq + + def get_mean_variance_with_backend(self, subset_size: int): + mean_variance = MeanVarianceCriterion(None, None, subset_size=subset_size) + mean_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return mean_variance + + def get_max_variance_with_backend(self, subset_size: int): + max_variance = MaxVarianceCriterion(None, None, subset_size=subset_size) + max_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return max_variance + + def get_mean_max_with_backend(self, subset_size: int): + mean_max_variance = MeanMaxCriterion(None, None, subset_size=subset_size) + mean_max_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return mean_max_variance diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 06047849ba1..cee43f783d0 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -44,15 +44,45 @@ UNSUPPORTED_MODES = (CompressWeightsMode.NF4, CompressWeightsMode.E2M1) -class MatMulModel(torch.nn.Module): +class SequentialMatmulModel(nn.Module): def __init__(self): + super(SequentialMatmulModel, self).__init__() + self.main_values = [10000, 1000, 1, 10, 10000] + self.layers = nn.ModuleList() + + for _, main_value in enumerate(self.main_values): + weights_data = torch.arange(0, 16, dtype=torch.float32).reshape(4, 4) + weights_data[-1, -1] = main_value + weight_tensor = torch.tensor(weights_data) + layer = nn.Linear(4, 4, bias=False) + layer.weight = nn.Parameter(weight_tensor.t()) + self.layers.append(layer) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class MatMulModel(torch.nn.Module): + def __init__(self, weight: torch.Tensor = torch.ones(size=(256, 256), dtype=torch.float32)): super().__init__() - self.w = torch.nn.Parameter(torch.ones(size=(256, 256), dtype=torch.float32)) + self.w = torch.nn.Parameter(weight) def forward(self, input): return input @ self.w +class LinearModel(torch.nn.Module): + def __init__(self, weight: 
torch.Tensor = torch.ones(size=(256, 256), dtype=torch.float32)): + super().__init__() + self.linear = torch.nn.Linear(weight.shape[0], weight.shape[1], False) + self.linear.weight = torch.nn.Parameter(weight) + + def forward(self, input): + return self.linear(input) + + class FunctionalModel(torch.nn.Module): def __init__(self): super().__init__() @@ -326,41 +356,10 @@ def test_pack_int4(): assert torch.all(unpacked_w == w_int8) -class IdentityMatmul(torch.nn.Module): - def __init__(self): - super().__init__() - self.w = torch.nn.Parameter( - torch.eye(3, dtype=torch.float32) * 255, - ) - - def forward(self, input): - return input @ self.w - - -class SequentialMatmulModel(nn.Module): - def __init__(self): - super(SequentialMatmulModel, self).__init__() - self.main_values = [10000, 1000, 1, 10, 10000] - self.layers = nn.ModuleList() - - for _, main_value in enumerate(self.main_values): - weights_data = torch.arange(0, 16, dtype=torch.float32).reshape(4, 4) - weights_data[-1, -1] = main_value - weight_tensor = torch.tensor(weights_data) - layer = nn.Linear(4, 4, bias=False) - layer.weight = nn.Parameter(weight_tensor.t()) - self.layers.append(layer) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - return x - - class TestPTTemplateWeightCompression(TemplateWeightCompression): @staticmethod def get_matmul_model() -> torch.nn.Module: - return IdentityMatmul() + return MatMulModel(255 * torch.eye(3, dtype=torch.float32)) @staticmethod def get_sequential_matmul_model() -> torch.nn.Module: @@ -381,3 +380,46 @@ def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None: assert torch.numel(op.weight) == 8 # workaround to detect uint4 weights else: assert torch.numel(op.weight) == 16 + + @staticmethod + def get_model_for_test_scale_estimation(): + return LinearModel(torch.arange(0, 32 * 32, dtype=torch.float32).reshape(32, 32)) + + @staticmethod + def get_scale_estimation_ref(): + return torch.tensor( + [ + [[2.0666666]], + [[3.7624271]], + [[5.8847833]], + [[8.0360603]], + [[10.1368332]], + [[12.2918606]], + [[14.3441496]], + [[16.4496689]], + [[18.6086369]], + [[20.8027000]], + [[22.9477024]], + [[25.0835018]], + [[27.1524105]], + [[29.1419849]], + [[31.1714401]], + [[33.0447121]], + [[35.1780472]], + [[37.3113823]], + [[39.4447136]], + [[41.5780487]], + [[43.7113838]], + [[45.8447189]], + [[47.9780464]], + [[50.1113815]], + [[52.2447128]], + [[54.3780441]], + [[56.5113831]], + [[58.6447144]], + [[60.7780533]], + [[62.9113808]], + [[65.0447083]], + [[67.1780548]], + ] + ) From f89ae9d7ede3174122bd0deb9df832b1b7cc4578 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 11:55:45 +0100 Subject: [PATCH 34/46] upd year --- tests/torch/fx/test_weights_compression_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/torch/fx/test_weights_compression_backends.py b/tests/torch/fx/test_weights_compression_backends.py index bd66093ec19..fccdd0e2c01 100644 --- a/tests/torch/fx/test_weights_compression_backends.py +++ b/tests/torch/fx/test_weights_compression_backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at From 026a0ed8ff57efefc94f162d5d8d56c9162445fc Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 14:07:30 +0100 Subject: [PATCH 35/46] add tinyllama_scale_estimation_group_size_64 --- tests/post_training/data/wc_reference_data.yaml | 8 ++++++++ tests/post_training/model_scope.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 3d27d81ee20..1309dbbc13c 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -56,3 +56,11 @@ tinyllama_int4_data_free_backend_TORCH: metric_value: 0.73873 num_int4: 114 num_int8: 84 +tinyllama_scale_estimation_group_size_64_backend_OV: + metric_value: 0.8566 + num_int4: 94 + num_int8: 124 +tinyllama_scale_estimation_group_size_64_backend_TORCH: + metric_value: 0.8566 + num_int4: 94 + num_int8: 124 \ No newline at end of file diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index ad2dd36757d..b9e07c77fab 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -531,6 +531,21 @@ }, "backends": [BackendType.OV], }, + { + "reported_name": "tinyllama_scale_estimation_group_size_64", + "model_id": "tinyllama/tinyllama-1.1b-step-50k-105b", + "pipeline_cls": LMWeightCompression, + "compression_params": { + "group_size": 64, + "ratio": 0.8, + "mode": CompressWeightsMode.INT4_SYM, + "scale_estimation": True, + "advanced_parameters": AdvancedCompressionParameters( + scale_estimation_params=AdvancedScaleEstimationParameters(32, 5, 10, 1.0) + ), + }, + "backends": [BackendType.OV, BackendType.TORCH], + }, ] From e3f12c216b0d8f78d409bd37a0612deb97f9a0b6 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 14:10:11 +0100 Subject: [PATCH 36/46] torch.no_grad -> torch.inference_mode --- nncf/experimental/torch/fx/quantization/quantize_model.py | 2 +- .../quantization/algorithms/weight_compression/torch_backend.py | 2 +- nncf/torch/quantization/quantize_model.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nncf/experimental/torch/fx/quantization/quantize_model.py b/nncf/experimental/torch/fx/quantization/quantize_model.py index ef2aac355ca..c30f653ce6e 100644 --- a/nncf/experimental/torch/fx/quantization/quantize_model.py +++ b/nncf/experimental/torch/fx/quantization/quantize_model.py @@ -135,7 +135,7 @@ def compress_weights_impl( """ Implementation of the `compress_weights()` method for the Torch Fx backend. 
""" - with torch.no_grad(): + with torch.inference_mode(): compression_algorithm = WeightCompression( mode, ratio, diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 518be41721d..1f843679737 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -62,7 +62,7 @@ def _prepare_inputs( - tensor: torch.Tensor, scale: torch.Tensor, zero_point=Optional[torch.Tensor] + tensor: torch.Tensor, scale: torch.Tensor, zero_point: Optional[torch.Tensor] = None ) -> Tuple[Tensor, Tensor, Optional[Tensor]]: tensor, scale = Tensor(tensor), Tensor(scale) if zero_point is not None: diff --git a/nncf/torch/quantization/quantize_model.py b/nncf/torch/quantization/quantize_model.py index ddc97c13d90..57408abff31 100644 --- a/nncf/torch/quantization/quantize_model.py +++ b/nncf/torch/quantization/quantize_model.py @@ -101,7 +101,7 @@ def compress_weights_impl( """ Implementation of the `compress_weights()` method for the PyTorch backend. """ - with torch.no_grad(): + with torch.inference_mode(): compression_algorithm = WeightCompression( mode, ratio, From a347a2598d063822a0deb98fe369468792433f7d Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 14:50:20 +0100 Subject: [PATCH 37/46] upd reference --- .../template_test_weights_compression.py | 7 +-- tests/openvino/native/models.py | 6 +-- .../quantization/test_weights_compression.py | 44 +++++-------------- tests/torch/ptq/test_weights_compression.py | 42 ++++-------------- 4 files changed, 26 insertions(+), 73 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index d5bdf386300..31cfa669df1 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -147,8 +147,9 @@ def test_scale_estimation(self, mocker): model = self.get_model_for_test_scale_estimation() # prepare dataset with one input tensor - input = np.arange(0, 32 * 32, dtype=np.float32).reshape(1, 32, 32) - input[0, 15] *= 100 # make one channel relatively higher. + input = np.arange(0, 8 * 8, dtype=np.float32).reshape(1, 8, 8) + input[0, 4] *= 100 # make one channel relatively higher. 
+ input = self.to_tensor(input) dataset = Dataset([input]) @@ -156,7 +157,7 @@ def test_scale_estimation(self, mocker): model, mode=CompressWeightsMode.INT4_ASYM, ratio=1.0, - group_size=32, + group_size=4, scale_estimation=True, all_layers=True, dataset=dataset, diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 56ef047f97f..c9ec2f8343d 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1187,11 +1187,11 @@ def _create_ov_model(self): return model -class MLP(OVReferenceModel): +class MatMul(OVReferenceModel): def _create_ov_model(self): - input_node = opset.parameter([1, 32, 32], name="Input") + input_node = opset.parameter([1, 8, 8], name="Input") - weights_data = np.arange(0, 32 * 32, dtype=np.float32).reshape(32, 32) + weights_data = np.arange(0, 8 * 8, dtype=np.float32).reshape(8, 8) weights_node = opset.constant(weights_data, dtype=np.float32, name="Weights") matmul_node = opset.matmul(input_node, weights_node, transpose_a=False, transpose_b=True, name="MatMul") diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index e0d072cb04a..6128b2d7829 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -50,13 +50,13 @@ from tests.cross_fw.test_templates.template_test_weights_compression import ACTIVATION from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.openvino.native.common import get_actual_reference_for_current_openvino -from tests.openvino.native.models import MLP from tests.openvino.native.models import AWQActMatmulModel from tests.openvino.native.models import AWQMatmulModel from tests.openvino.native.models import GatherAndMatmulShareData from tests.openvino.native.models import GatherWithTwoReductionAxes from tests.openvino.native.models import IdentityMatmul from tests.openvino.native.models import IntegerModel +from tests.openvino.native.models import MatMul from tests.openvino.native.models import ModelNamedConsts from tests.openvino.native.models import OVReferenceModel from tests.openvino.native.models import SequentialMatmulModel @@ -1550,43 +1550,19 @@ def check_weights(model: ov.Model, ref_ids: List[int]) -> None: @staticmethod def get_model_for_test_scale_estimation(): - return MLP().ov_model + return MatMul().ov_model @staticmethod def get_scale_estimation_ref(): return np.array( [ - [[2.0666666]], - [[3.7624273]], - [[5.884783]], - [[8.03606]], - [[10.136832]], - [[12.291862]], - [[14.34415]], - [[16.449669]], - [[18.608639]], - [[20.802698]], - [[22.9477]], - [[25.083504]], - [[27.152409]], - [[29.141987]], - [[31.171442]], - [[33.044716]], - [[35.178047]], - [[37.31138]], - [[39.444714]], - [[41.578045]], - [[43.71138]], - [[45.844715]], - [[47.978046]], - [[50.11138]], - [[52.244713]], - [[54.378044]], - [[56.511383]], - [[58.644714]], - [[60.77805]], - [[62.91138]], - [[65.044716]], - [[67.17805]], + [[0.2], [0.41354424]], + [[0.6782236], [0.9470368]], + [[1.1691767], [1.4355733]], + [[1.7025099], [1.9689066]], + [[2.2722175], [2.543369]], + [[2.8146443], [3.0858421]], + [[3.3025098], [3.5689068]], + [[3.8358433], [4.1022396]], ] ) diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index cee43f783d0..92f90190a0d 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ 
b/tests/torch/ptq/test_weights_compression.py
@@ -383,43 +380,46 @@ def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None:
 
     @staticmethod
     def get_model_for_test_scale_estimation():
-        return LinearModel(torch.arange(0, 32 * 32, dtype=torch.float32).reshape(32, 32))
+        return LinearModel(torch.arange(0, 8 * 8, dtype=torch.float32).reshape(8, 8))
 
     @staticmethod
     def get_scale_estimation_ref():
         return torch.tensor(
             [
-                [[2.0666666]],
-                [[3.7624271]],
-                [[5.8847833]],
-                [[8.0360603]],
-                [[10.1368332]],
-                [[12.2918606]],
-                [[14.3441496]],
-                [[16.4496689]],
-                [[18.6086369]],
-                [[20.8027000]],
-                [[22.9477024]],
-                [[25.0835018]],
-                [[27.1524105]],
-                [[29.1419849]],
-                [[31.1714401]],
-                [[33.0447121]],
-                [[35.1780472]],
-                [[37.3113823]],
-                [[39.4447136]],
-                [[41.5780487]],
-                [[43.7113838]],
-                [[45.8447189]],
-                [[47.9780464]],
-                [[50.1113815]],
-                [[52.2447128]],
-                [[54.3780441]],
-                [[56.5113831]],
-                [[58.6447144]],
-                [[60.7780533]],
-                [[62.9113808]],
-                [[65.0447083]],
-                [[67.1780548]],
+                [[0.200000], [0.413544]],
+                [[0.678224], [0.947037]],
+                [[1.169177], [1.435573]],
+                [[1.702510], [1.968907]],
+                [[2.272218], [2.543369]],
+                [[2.814644], [3.085842]],
+                [[3.302510], [3.568907]],
+                [[3.835843], [4.102240]],
             ]
         )

From 601f2e4e9290dc5162b6a3b4f2a7dcc3714ebe78 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Wed, 22 Jan 2025 10:47:11 +0100
Subject: [PATCH 38/46] test: upd int4 weight locator for torch

---
 .../native/quantization/test_weights_compression.py |  4 ++--
 tests/torch/ptq/test_weights_compression.py         | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index 6128b2d7829..c45f98fa72d 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -1545,8 +1545,8 @@ def cast_to(x: np.ndarray, dtype: TensorDataType) -> np.ndarray:
     @staticmethod
     def check_weights(model: ov.Model, ref_ids: List[int]) -> None:
         names = {op.get_friendly_name() for op in model.get_ordered_ops() if op.get_element_type() == ov.Type.i4}
-        ref_nf4_nodes = {f"weights_{i}" for i in ref_ids}
-        assert ref_nf4_nodes == names
+        low_precision_nodes = {f"weights_{i}" for i in ref_ids}
+        assert low_precision_nodes == names
 
     @staticmethod
     def get_model_for_test_scale_estimation():
diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py
index 92f90190a0d..06c69887bff 100644
--- a/tests/torch/ptq/test_weights_compression.py
+++ b/tests/torch/ptq/test_weights_compression.py
@@ -375,11 +375,11 @@ def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor:
 
     @staticmethod
     def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None:
-        for i, op in enumerate(model.layers):
-            if i in ref_ids:
-                assert torch.numel(op.weight) == 8  # workaround to detect uint4 weights
-            else:
-                assert torch.numel(op.weight) == 16
+        low_precision_nodes = {f"{i}_weight" for i in ref_ids}
+        for op_name, op in model.nncf.external_op.items():
+            for name in low_precision_nodes:
+                if name in op_name:
+                    assert isinstance(op, INT4SymmetricWeightsDecompressor)

From 32bc0e59ab71a83e839389bd852b99d60eac9df0 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Wed, 22 Jan 2025 10:59:46 +0100
Subject: [PATCH 39/46] upd license year

---
 tests/torch/ptq/test_weights_compression_backends.py | 2 +-
 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/tests/torch/ptq/test_weights_compression_backends.py b/tests/torch/ptq/test_weights_compression_backends.py index bd66093ec19..fccdd0e2c01 100644 --- a/tests/torch/ptq/test_weights_compression_backends.py +++ b/tests/torch/ptq/test_weights_compression_backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at From 568809ce8d380f78526fd8c4fb966e52d6a61535 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Thu, 23 Jan 2025 16:41:10 +0100 Subject: [PATCH 40/46] rebase --- .../weight_compression/scale_estimation.py | 5 +-- .../weight_compression/torch_backend.py | 39 ------------------- 2 files changed, 1 insertion(+), 43 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index e09bb77b7f2..0e812b71760 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -13,15 +13,14 @@ from typing import Dict, List, Optional, Tuple, TypeVar import nncf -from nncf import Dataset from nncf.common.graph.graph import NNCFGraph from nncf.common.logging.track_progress import track -from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale @@ -94,8 +93,6 @@ def apply( graph: NNCFGraph, all_weight_params: List[WeightCompressionParameters], statistics: Dict[str, WCTensorStatistic], - statistic_points: Optional[StatisticPointsContainer] = None, - dataset: Optional[Dataset] = None, backend_entity: Optional[WeightCompressionAlgoBackend] = None, ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: """ diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 1f843679737..d4741a6c2d1 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -37,12 +37,9 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from 
nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType from nncf.torch.dynamic_graph.scope import Scope @@ -61,34 +58,6 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -def _prepare_inputs( - tensor: torch.Tensor, scale: torch.Tensor, zero_point: Optional[torch.Tensor] = None -) -> Tuple[Tensor, Tensor, Optional[Tensor]]: - tensor, scale = Tensor(tensor), Tensor(scale) - if zero_point is not None: - zero_point = Tensor(zero_point) - return tensor, scale, zero_point - - -def get_compress_fn(config: WeightCompressionConfig) -> Callable[[Tuple], Tensor]: - def _forward_fn(inputs: Tuple) -> Tensor: - tensor, scale, zero_point = _prepare_inputs(*inputs) - quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) - return quantized.data - - return _forward_fn - - -def get_compress_decompress_fn(config: WeightCompressionConfig) -> Callable[[Tuple], Tensor]: - def _forward_fn(inputs: Tuple) -> Tensor: - tensor, scale, zero_point = _prepare_inputs(*inputs) - quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) - dequantized = do_int_dequantization(quantized, scale=scale, zero_point=zero_point) - return dequantized.data - - return _forward_fn - - class PTWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): TARGET_TYPE_TO_PT_INS_TYPE_MAP = { TargetType.PRE_LAYER_OPERATION: TargetType.OPERATOR_PRE_HOOK, @@ -241,14 +210,6 @@ def insert_adapters( ) -> None: pass - @staticmethod - def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - return get_compress_decompress_fn(config) - - @staticmethod - def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): - return get_compress_fn(config) - @staticmethod def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: def filter_func(point: StatisticPoint) -> bool: From 8c7efd601ed71e082a7666adef51916f66364982 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 11:09:58 +0100 Subject: [PATCH 41/46] add test on scale estimation --- nncf/torch/engine.py | 1 - .../template_test_weights_compression.py | 45 +++++++++++++++++-- tests/openvino/native/models.py | 4 +- .../quantization/test_weights_compression.py | 43 ++++++++++++++---- tests/torch/ptq/test_weights_compression.py | 43 +++++++++++++----- 5 files changed, 111 insertions(+), 25 deletions(-) diff --git a/nncf/torch/engine.py b/nncf/torch/engine.py index 239c6857c24..fbbc1d083b3 100644 --- a/nncf/torch/engine.py +++ b/nncf/torch/engine.py @@ -48,5 +48,4 @@ def infer( return self._model(**input_data) if isinstance(input_data, tuple): return self._model(*input_data) - return self._model(input_data) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 31cfa669df1..0457d66ba4f 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -11,6 +11,7 @@ import math from abc import ABC from abc import 
abstractmethod +from copy import deepcopy from typing import List, TypeVar import numpy as np @@ -143,13 +144,12 @@ def get_scale_estimation_ref(): """ def test_scale_estimation(self, mocker): + """Checks that scales match the reference.""" calc_q_params_spy = mocker.spy(ScaleEstimation, "calculate_quantization_params") model = self.get_model_for_test_scale_estimation() # prepare dataset with one input tensor - input = np.arange(0, 8 * 8, dtype=np.float32).reshape(1, 8, 8) - input[0, 4] *= 100 # make one channel relatively higher. - + input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) input = self.to_tensor(input) dataset = Dataset([input]) @@ -157,10 +157,47 @@ def test_scale_estimation(self, mocker): model, mode=CompressWeightsMode.INT4_ASYM, ratio=1.0, - group_size=4, + group_size=8, scale_estimation=True, all_layers=True, dataset=dataset, ) reference = self.get_scale_estimation_ref() assert fns.allclose(Tensor(reference), calc_q_params_spy.spy_return[0]) + + @abstractmethod + def get_orig_weight(model: TModel) -> Tensor: + """Returns original weight.""" + + @abstractmethod + def get_decompressed_weight(compressed_model: TModel, input: TTensor) -> Tensor: + """Returns decompressed weight""" + + def test_scale_estimation_outlier_channel_has_lowest_error(self): + """Checks that outlier channel has a lowest error after quantization.""" + OUTLIER_CHANNEL = 4 + model = self.get_model_for_test_scale_estimation() + + # prepare dataset with one input tensor + input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) + input[ + :, :, OUTLIER_CHANNEL + ] *= 1000 # make one channel relatively higher. This channel should have lowest error. + input = self.to_tensor(input) + dataset = Dataset([input]) + + compressed_model = compress_weights( + deepcopy(model), + mode=CompressWeightsMode.INT4_ASYM, + ratio=1.0, + group_size=-1, + scale_estimation=True, + all_layers=True, + dataset=dataset, + ) + + decompressed_weight = self.get_decompressed_weight(compressed_model, input) + original_weight = self.get_orig_weight(model) + diff = (decompressed_weight - original_weight) ** 2 + layer_err = fns.mean(diff, axis=0) / fns.mean(original_weight**2, axis=0) + assert fns.argsort(layer_err)[0] == OUTLIER_CHANNEL diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index c9ec2f8343d..e5efdaf8235 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1189,9 +1189,9 @@ def _create_ov_model(self): class MatMul(OVReferenceModel): def _create_ov_model(self): - input_node = opset.parameter([1, 8, 8], name="Input") + input_node = opset.parameter([1, 4, 8], name="Input") - weights_data = np.arange(0, 8 * 8, dtype=np.float32).reshape(8, 8) + weights_data = np.arange(0, 16 * 8, dtype=np.float32).reshape(16, 8) weights_node = opset.constant(weights_data, dtype=np.float32, name="Weights") matmul_node = opset.matmul(input_node, weights_node, transpose_a=False, transpose_b=True, name="MatMul") diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 881f9596936..0a2adc6dd85 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -26,6 +26,7 @@ from nncf.common.utils.debug import nncf_debug from nncf.data.dataset import Dataset from nncf.experimental.common.tensor_statistics.collectors import AggregatorBase +from nncf.openvino.graph.model_transformer import OVModelTransformer 
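The outlier-channel test added above exercises the core idea of scale estimation: rather than taking the scale straight from the weight range, search for a per-group rescaling that minimizes the activation-weighted reconstruction error. A simplified sketch of such a search (an assumed grid-search form for illustration, not NNCF's actual algorithm):

import numpy as np

def quant_dequant(w, scale, zero_point, levels=16):
    # INT4_ASYM round trip: quantize onto integer levels 0..15, then dequantize.
    q = np.clip(np.round(w / scale + zero_point), 0, levels - 1)
    return (q - zero_point) * scale

def estimate_scale(w_group, x_stat, steps=32):
    # x_stat holds per-channel activation magnitudes; a channel boosted by 1000
    # dominates the objective, so the chosen scale keeps its weights accurate.
    init = (w_group.max() - w_group.min()) / 15.0
    zero_point = np.round(-w_group.min() / init)
    candidates = init * np.linspace(0.7, 1.1, steps)
    errors = [
        np.mean((x_stat * (w_group - quant_dequant(w_group, s, zero_point))) ** 2)
        for s in candidates
    ]
    return candidates[int(np.argmin(errors))]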
from nncf.openvino.graph.node_utils import get_const_value from nncf.parameters import BackupMode from nncf.quantization import compress_weights @@ -1524,13 +1525,39 @@ def get_model_for_test_scale_estimation(): def get_scale_estimation_ref(): return np.array( [ - [[0.2], [0.41354424]], - [[0.6782236], [0.9470368]], - [[1.1691767], [1.4355733]], - [[1.7025099], [1.9689066]], - [[2.2722175], [2.543369]], - [[2.8146443], [3.0858421]], - [[3.3025098], [3.5689068]], - [[3.8358433], [4.1022396]], + [[0.473328]], + [[0.929023]], + [[1.446527]], + [[1.920595]], + [[2.517053]], + [[3.030101]], + [[3.584278]], + [[4.04351]], + [[4.620007]], + [[5.165322]], + [[5.710637]], + [[6.122580]], + [[6.655914]], + [[7.237173]], + [[7.722581]], + [[8.255914]], ] ) + + @staticmethod + def get_orig_weight(model: ov.Model) -> Tensor: + for op in model.get_ordered_ops(): + op_name = op.get_friendly_name() + if op.get_type_name() == "Constant" and op_name == "Weights": + return Tensor(op.data) + + @staticmethod + def get_decompressed_weight(compressed_model: ov.Model, input: np.ndarray) -> Tensor: + # Insert extra output to get the compressed weights. + node = [op for op in compressed_model.get_ops() if op.get_friendly_name() == "Weights/fq_weights_1/convert"][0] + output = node.output(0) + extra_outputs = [(output, 0, None)] + model = OVModelTransformer._insert_outputs(compressed_model, extra_outputs) + compiled_model = ov.compile_model(model, device_name="CPU") + weight_output = compiled_model(input)[1] + return Tensor(weight_output) diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 06c69887bff..0889284b453 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -22,6 +22,7 @@ from nncf import SensitivityMetric from nncf.quantization import compress_weights from nncf.quantization.advanced_parameters import AdvancedCompressionParameters +from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.torch import wrap_model from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor @@ -63,6 +64,9 @@ def forward(self, x): x = layer(x) return x + def get_weight_names_in_exec_order(self): + return [f"{i}_weight" for i in range(len(self.main_values))] + class MatMulModel(torch.nn.Module): def __init__(self, weight: torch.Tensor = torch.ones(size=(256, 256), dtype=torch.float32)): @@ -375,7 +379,8 @@ def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: @staticmethod def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None: - low_precision_nodes = {f"{i}_weight" for i in ref_ids} + all_names = model.get_weight_names_in_exec_order() + low_precision_nodes = list(map(lambda i: all_names[i], ref_ids)) for op_name, op in model.nncf.external_op.items(): for name in low_precision_nodes: if name in op_name: @@ -383,19 +388,37 @@ def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None: @staticmethod def get_model_for_test_scale_estimation(): - return LinearModel(torch.arange(0, 8 * 8, dtype=torch.float32).reshape(8, 8)) + return LinearModel(torch.arange(0, 8 * 16, dtype=torch.float32).reshape(16, 8)) @staticmethod def get_scale_estimation_ref(): return torch.tensor( [ - [[0.200000], [0.413544]], - [[0.678224], [0.947037]], - [[1.169177], [1.435573]], - [[1.702510], [1.968907]], - [[2.272218], [2.543369]], - [[2.814644], [3.085842]], - [[3.302510], [3.568907]], - [[3.835843], [4.102240]], + [[0.473328]], + [[0.929023]], + [[1.446527]], + 
[[1.920595]], + [[2.517054]], + [[3.030102]], + [[3.584279]], + [[4.043509]], + [[4.620008]], + [[5.165322]], + [[5.710637]], + [[6.122581]], + [[6.655914]], + [[7.237174]], + [[7.722580]], + [[8.255914]], ] ) + + @staticmethod + def get_orig_weight(model: torch.nn.Module) -> Tensor: + return Tensor(model.linear.weight) + + @staticmethod + def get_decompressed_weight(compressed_model: torch.nn.Module, input: torch.Tensor) -> Tensor: + weight = compressed_model.linear.weight + unpacked_w = compressed_model.nncf.external_op.weights_decompressor_linear_weight(weight) + return Tensor(unpacked_w) From 64f588fa787842c499431b34cee10c5c83e9e09e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 14:10:07 +0100 Subject: [PATCH 42/46] add check on reducing error after SE --- .../template_test_weights_compression.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 0457d66ba4f..5c94cc30f22 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -22,8 +22,10 @@ from nncf import SensitivityMetric from nncf.data.dataset import Dataset from nncf.quantization import compress_weights +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation +from nncf.quantization.algorithms.weight_compression.weight_lowering import quantize_dequantize_weight from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -39,6 +41,11 @@ MAX_BASELINE_SCORE = 1 / 1.1920928955078125e-07 +def get_realtive_error(weight_1: Tensor, weight_2: Tensor, axis: int = 0) -> Tensor: + diff = (weight_1 - weight_2) ** 2 + return fns.mean(diff, axis=axis) / fns.mean(weight_1**2, axis=axis) + + class TemplateWeightCompression(ABC): @staticmethod @abstractmethod @@ -180,9 +187,7 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): # prepare dataset with one input tensor input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) - input[ - :, :, OUTLIER_CHANNEL - ] *= 1000 # make one channel relatively higher. This channel should have lowest error. + input[:, :, OUTLIER_CHANNEL] *= 1000 # make one channel relatively higher, should have lowest error. 
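For context, the round-to-nearest baseline that this new check compares against comes straight from the weight-lowering helper imported above; the call can also be used standalone, as in this sketch mirroring the test's arguments:

import numpy as np

from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import quantize_dequantize_weight
from nncf.tensor import Tensor

# Per-channel (group_size=-1) INT4_ASYM quantize-dequantize round trip of the
# raw weight -- the "no scale estimation" reference for the relative-error check.
weight = Tensor(np.arange(0, 4 * 8, dtype=np.float32).reshape(4, 8))
config = WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, -1)
w_deq = quantize_dequantize_weight(weight, config=config, reduction_axes=1)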
input = self.to_tensor(input) dataset = Dataset([input]) @@ -196,8 +201,12 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): dataset=dataset, ) - decompressed_weight = self.get_decompressed_weight(compressed_model, input) original_weight = self.get_orig_weight(model) - diff = (decompressed_weight - original_weight) ** 2 - layer_err = fns.mean(diff, axis=0) / fns.mean(original_weight**2, axis=0) - assert fns.argsort(layer_err)[0] == OUTLIER_CHANNEL + decompressed_weight_before_se = quantize_dequantize_weight( + original_weight, config=WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, -1), reduction_axes=1 + ) + decompressed_weight_after_se = self.get_decompressed_weight(compressed_model, input) + error_before_se = get_realtive_error(original_weight, decompressed_weight_before_se) + error_after_se = get_realtive_error(original_weight, decompressed_weight_after_se) + assert fns.argsort(error_after_se)[0] == OUTLIER_CHANNEL # the smallest error on the outlier channel + assert error_before_se[OUTLIER_CHANNEL] > error_after_se[OUTLIER_CHANNEL] From be92375626ae063e6b111f1d3485ae1a49c7b9c0 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 14:30:20 +0100 Subject: [PATCH 43/46] upd atol for model (difference across devices) --- tests/post_training/data/wc_reference_data.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 1309dbbc13c..5bed81c4c79 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -39,6 +39,7 @@ tinyllama_scale_estimation_per_channel_backend_TORCH: metric_value: 0.81389 num_int4: 188 num_int8: 124 + atol: 0.006 # difference across devices: 0.80873 vs 0.81389 tinyllama_data_aware_lora_stateful_backend_OV: metric_value: 0.83446 num_int4: 94 From 9037dd2a684f16563a68f51ee1f9ca1955407264 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 15:09:22 +0100 Subject: [PATCH 44/46] no copy --- .../template_test_weights_compression.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 5c94cc30f22..ae34b1f4c21 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -11,7 +11,6 @@ import math from abc import ABC from abc import abstractmethod -from copy import deepcopy from typing import List, TypeVar import numpy as np @@ -41,7 +40,7 @@ MAX_BASELINE_SCORE = 1 / 1.1920928955078125e-07 -def get_realtive_error(weight_1: Tensor, weight_2: Tensor, axis: int = 0) -> Tensor: +def get_relative_error(weight_1: Tensor, weight_2: Tensor, axis: int = 0) -> Tensor: diff = (weight_1 - weight_2) ** 2 return fns.mean(diff, axis=axis) / fns.mean(weight_1**2, axis=axis) @@ -184,6 +183,7 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): """Checks that outlier channel has a lowest error after quantization.""" OUTLIER_CHANNEL = 4 model = self.get_model_for_test_scale_estimation() + original_weight = self.get_orig_weight(model) # prepare dataset with one input tensor input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) @@ -192,7 +192,7 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): dataset = Dataset([input]) compressed_model = compress_weights( - deepcopy(model), + 
model, mode=CompressWeightsMode.INT4_ASYM, ratio=1.0, group_size=-1, @@ -201,12 +201,11 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): dataset=dataset, ) - original_weight = self.get_orig_weight(model) decompressed_weight_before_se = quantize_dequantize_weight( original_weight, config=WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, -1), reduction_axes=1 ) decompressed_weight_after_se = self.get_decompressed_weight(compressed_model, input) - error_before_se = get_realtive_error(original_weight, decompressed_weight_before_se) - error_after_se = get_realtive_error(original_weight, decompressed_weight_after_se) + error_before_se = get_relative_error(original_weight, decompressed_weight_before_se) + error_after_se = get_relative_error(original_weight, decompressed_weight_after_se) assert fns.argsort(error_after_se)[0] == OUTLIER_CHANNEL # the smallest error on the outlier channel assert error_before_se[OUTLIER_CHANNEL] > error_after_se[OUTLIER_CHANNEL] From 34570faa2883268a6c39d50bb2cd25de52bdb427 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 15:24:38 +0100 Subject: [PATCH 45/46] new line --- tests/post_training/data/wc_reference_data.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 5bed81c4c79..864fe133d0a 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -64,4 +64,5 @@ tinyllama_scale_estimation_group_size_64_backend_OV: tinyllama_scale_estimation_group_size_64_backend_TORCH: metric_value: 0.8566 num_int4: 94 - num_int8: 124 \ No newline at end of file + num_int8: 124 + \ No newline at end of file From 5e5440babae128b7ede8d6509de4c3a4b3d14d8e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 15:41:58 +0100 Subject: [PATCH 46/46] polishing --- nncf/torch/engine.py | 1 + .../test_templates/template_test_weights_compression.py | 4 ++-- tests/post_training/data/wc_reference_data.yaml | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nncf/torch/engine.py b/nncf/torch/engine.py index fbbc1d083b3..ed70e8fb3a3 100644 --- a/nncf/torch/engine.py +++ b/nncf/torch/engine.py @@ -44,6 +44,7 @@ def infer( :param input_data: Inputs for the model. :return: Model outputs. """ + if isinstance(input_data, dict): return self._model(**input_data) if isinstance(input_data, tuple): diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index ae34b1f4c21..6ffc479c1f3 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -137,14 +137,14 @@ def test_mixed_precision(self, mode, all_layers, ratio, ref_ids): @staticmethod @abstractmethod - def get_model_for_test_scale_estimation(): + def get_model_for_test_scale_estimation() -> TModel: """ Returns a backend model for test_scale_estimation. """ @staticmethod @abstractmethod - def get_scale_estimation_ref(): + def get_scale_estimation_ref() -> TTensor: """ Returns the reference output of calculate_quantization_params of ScaleEstimation. 
""" diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 864fe133d0a..bb13d2eb26d 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -65,4 +65,3 @@ tinyllama_scale_estimation_group_size_64_backend_TORCH: metric_value: 0.8566 num_int4: 94 num_int8: 124 - \ No newline at end of file