From 09126af02382b92c017f8a8bba6c99e1c608b623 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Fri, 3 Jan 2025 16:02:37 +0100
Subject: [PATCH 01/46] add torch sample

---
 examples/llm_compression/torch/main.py        | 74 +++++++++++++++++++
 .../weight_compression/algorithm.py           |  2 -
 .../weight_compression/mixed_precision.py     | 17 ++++-
 .../weight_compression/openvino_backend.py    |  2 +
 .../weight_compression/scale_estimation.py    | 10 ++-
 .../weight_compression/torch_backend.py       | 40 ++++++++++
 .../weight_compression/torch_fx_backend.py    | 40 ++++++++++
 nncf/quantization/quantize_model.py           |  8 --
 8 files changed, 178 insertions(+), 15 deletions(-)
 create mode 100644 examples/llm_compression/torch/main.py

diff --git a/examples/llm_compression/torch/main.py b/examples/llm_compression/torch/main.py
new file mode 100644
index 00000000000..88449b42281
--- /dev/null
+++ b/examples/llm_compression/torch/main.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+
+import torch
+from datasets import load_dataset
+from optimum.exporters.openvino.convert import export_from_model
+from optimum.intel.openvino import OVModelForCausalLM
+from transformers import AutoModelForCausalLM
+from transformers import AutoTokenizer
+
+import nncf
+
+MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"
+OUTPUT_DIR = "tinyllama_compressed"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def quantize(model, dataset):
+    quantization_dataset = nncf.Dataset(dataset)
+    compressed_model = nncf.compress_weights(
+        model,
+        dataset=quantization_dataset,
+        mode=nncf.CompressWeightsMode.INT4_SYM,
+        ratio=0.8,
+        sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+    )
+    compressed_model.to("cpu")  # move to CPU to work around an issue with CUDA export
+    export_from_model(compressed_model, OUTPUT_DIR, stateful=False, compression_option="fp32", device="cpu")
+
+
+def validate(tokenizer, model):
+    input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)
+
+    start_t = time.time()
+    output = model.generate(**input_ids, max_new_tokens=100)
+    print("Elapsed time: ", time.time() - start_t)
+
+    output_text = tokenizer.decode(output[0])
+    print(output_text)
+    return output_text
+
+
+def main():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
+    model.eval()
+
+    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+    dataset = dataset.filter(lambda example: len(example["text"]) > 128)  # TODO: check whether this filter is necessary
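+    # NOTE: the 128-character threshold is a heuristic to skip near-empty wikitext entries.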
+
+    def transform_fn(data):
+        tokenized_text = tokenizer(data["text"], return_tensors="pt")
+        return tokenized_text.data  # return a dict, one of the input formats the NNCF engine accepts
+
+    dataset = dataset.map(transform_fn).with_format("torch", device=device)
+
+    quantize(model, dataset)
+    compressed_model = OVModelForCausalLM.from_pretrained(
+        OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"}
+    )
+    validate(tokenizer, compressed_model)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py
index c5a4e2d221c..14d0c800d32 100644
--- a/nncf/quantization/algorithms/weight_compression/algorithm.py
+++ b/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -702,8 +702,6 @@ def get_matmul_input_to_output_nodes_map(
         """
        matmul_input_to_output_nodes_map = defaultdict(list)
        for node in matmul_nodes:
-            if node.layer_attributes.input_attributes["transpose"]:  # It works only for OV
-                raise nncf.UnsupportedModelError("Transposed input is not supported")
            act_node, output_port_id = self._get_activation_node_and_port(node, graph)
            matmul_input_to_output_nodes_map[(act_node, output_port_id)].append(node)
        return matmul_input_to_output_nodes_map
diff --git a/nncf/quantization/algorithms/weight_compression/mixed_precision.py b/nncf/quantization/algorithms/weight_compression/mixed_precision.py
index f2fbae67a5c..900b639feeb 100644
--- a/nncf/quantization/algorithms/weight_compression/mixed_precision.py
+++ b/nncf/quantization/algorithms/weight_compression/mixed_precision.py
@@ -211,7 +211,7 @@ class DataBasedCriterion(DataFreeCriterion, ABC):

     @property
     def available_backends(self) -> List[BackendType]:
-        return [BackendType.OPENVINO]
+        return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX]

     def _set_backend_entity(self, model: TModel) -> None:
         model_backend = get_backend(model)
@@ -219,6 +219,14 @@ def _set_backend_entity(self, model: TModel) -> None:
             from nncf.quantization.algorithms.weight_compression.openvino_backend import OVMixedPrecisionAlgoBackend

             self._backend_entity = OVMixedPrecisionAlgoBackend(model)
+        elif model_backend == BackendType.TORCH:
+            from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend
+
+            self._backend_entity = PTMixedPrecisionAlgoBackend()
+        elif model_backend == BackendType.TORCH_FX:
+            from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXMixedPrecisionAlgoBackend
+
+            self._backend_entity = FXMixedPrecisionAlgoBackend()
         else:
             raise nncf.UnsupportedBackendError(
                 "Cannot return backend-specific entity because {} is not supported!".format(model_backend.value)
@@ -303,7 +311,7 @@ def _get_activation_node_and_port(self, node: NNCFNode, nncf_graph: NNCFGraph) -
     def _get_statistics_for_node(
         self, statistic_points: StatisticPointsContainer, node: NNCFNode, nncf_graph: NNCFGraph, stat_key: str
     ) -> List[Tensor]:
-        act_node, output_port_id = self._get_activation_node_and_port(node, nncf_graph)
+        act_node, _ = self._get_activation_node_and_port(node, nncf_graph)

         def input_filter_func(point):
             # For the floating-point statistics collected in POST_LAYER style,
             # we also need to determine the output port id.
             # For the cases when the layer has more than one output port.
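+            # NB: Torch statistic points are registered as operator post-hooks and do
+            # not carry an output port id, so the port id check is relaxed below.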
            return (
                self._algorithm_key in point.algorithm_to_tensor_collectors
-                and point.target_point.type == TargetType.POST_LAYER_OPERATION
-                and point.target_point.port_id == output_port_id
+                and point.target_point.type in [TargetType.POST_LAYER_OPERATION, TargetType.OPERATOR_POST_HOOK]
+                # and point.target_point.port_id == output_port_id
+                # TODO: move the port id check into a backend-specific filter function
            )

        stats = []
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
index ec4dfab4711..2917c4ada3d 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -105,6 +105,8 @@ def mean_statistic_collector(

     @staticmethod
     def get_activation_port_id(node: NNCFNode, nncf_graph: NNCFGraph) -> int:
+        if node.layer_attributes.input_attributes["transpose"]:  # It works only for OV
+            raise nncf.UnsupportedModelError("Transposed input is not supported")
         constant_ports = node.layer_attributes.get_const_port_ids()
         activation_ports = [
             e.input_port_id for e in nncf_graph.get_input_edges(node) if e.input_port_id not in constant_ports
diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index a5572530857..90ff789a429 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -84,7 +84,7 @@ def __init__(

     @property
     def available_backends(self) -> List[BackendType]:
-        return [BackendType.OPENVINO]
+        return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX]

     def _set_backend_entity(self, model: TModel) -> None:
         """
@@ -101,6 +101,14 @@ def _set_backend_entity(self, model: TModel) -> None:
             from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend

             self._backend_entity = OVWeightCompressionAlgoBackend(model, self.name_to_node_mapping)
+        if model_backend == BackendType.TORCH:
+            from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend
+
+            self._backend_entity = PTWeightCompressionAlgoBackend(model, self.name_to_node_mapping)
+        if model_backend == BackendType.TORCH_FX:
+            from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend
+
+            self._backend_entity = FXWeightCompressionAlgoBackend(model, self.name_to_node_mapping)
         else:
             raise nncf.UnsupportedBackendError(
                 "Cannot return backend-specific AWQ entity because {} is not supported!".format(model_backend.value)
diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py
index 136c38413ab..3c5914a41cb 100644
--- a/nncf/quantization/algorithms/weight_compression/torch_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py
@@ -21,12 +21,20 @@
 from nncf.common.graph.operator_metatypes import OperatorMetatype
 from nncf.common.graph.transformations.commands import TargetType
 from nncf.common.graph.transformations.layout import TransformationLayout
+from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer
+from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer
+from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator
from 
nncf.experimental.common.tensor_statistics.collectors import MeanReducer +from nncf.experimental.common.tensor_statistics.collectors import MeanVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator from nncf.experimental.common.tensor_statistics.collectors import ShapeReducer from nncf.experimental.common.tensor_statistics.collectors import TensorCollector +from nncf.experimental.common.tensor_statistics.statistics import MaxVarianceTensorStatistic +from nncf.experimental.common.tensor_statistics.statistics import MeanMagnitudeTensorStatistic +from nncf.experimental.common.tensor_statistics.statistics import MeanVarianceTensorStatistic from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -291,3 +299,35 @@ def transform_model( transformed_model = PTModelTransformer(model).transform(transformation_layout) return transformed_model + + +class PTMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, PTWeightCompressionAlgoBackend): + @staticmethod + def mean_variance_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MeanVarianceReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MeanVarianceTensorStatistic) + collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator) + return collector + + @staticmethod + def max_variance_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MaxVarianceReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MaxVarianceTensorStatistic) + collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) + return collector + + @staticmethod + def mean_abs_max_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MeanAbsMaxReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MeanMagnitudeTensorStatistic) + collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) + return collector diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index c7c0a685244..794bc4a6427 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -22,10 +22,17 @@ from nncf.common.graph.operator_metatypes import OperatorMetatype from nncf.common.graph.transformations.commands import TargetType from nncf.common.graph.transformations.layout import TransformationLayout +from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer +from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer +from 
nncf.experimental.common.tensor_statistics.collectors import MeanAggregator from nncf.experimental.common.tensor_statistics.collectors import MeanReducer +from nncf.experimental.common.tensor_statistics.collectors import MeanVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator from nncf.experimental.common.tensor_statistics.collectors import ShapeReducer from nncf.experimental.common.tensor_statistics.collectors import TensorCollector +from nncf.experimental.common.tensor_statistics.statistics import MaxVarianceTensorStatistic +from nncf.experimental.common.tensor_statistics.statistics import MeanMagnitudeTensorStatistic +from nncf.experimental.common.tensor_statistics.statistics import MeanVarianceTensorStatistic from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.experimental.torch.fx.commands import FXApplyTransformationCommand from nncf.experimental.torch.fx.model_transformer import FXModelTransformer @@ -34,6 +41,7 @@ from nncf.experimental.torch.fx.transformations import constant_update_transformation_builder from nncf.experimental.torch.fx.transformations import module_insertion_transformation_builder from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -251,3 +259,35 @@ def transform_model( transformed_model = FXModelTransformer(model).transform(transformation_layout) return transformed_model + + +class FXMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, FXWeightCompressionAlgoBackend): + @staticmethod + def mean_variance_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MeanVarianceReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MeanVarianceTensorStatistic) + collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator) + return collector + + @staticmethod + def max_variance_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MaxVarianceReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MaxVarianceTensorStatistic) + collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) + return collector + + @staticmethod + def mean_abs_max_statistic_collector( + reduction_axes: Tuple[int], subset_size: Optional[int] = None + ) -> TensorCollector: + reducer = MeanAbsMaxReducer(reduction_axes, inplace=True) + aggregator = MeanAggregator(num_samples=subset_size) + collector = TensorCollector(MeanMagnitudeTensorStatistic) + collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) + return collector diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 4c85ed42aaf..c8eb25d50e4 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -514,7 +514,6 @@ def compress_weights( options = { "awq": awq, - "scale_estimation": scale_estimation, "gptq": 
gptq,
            "lora_correction": lora_correction,
        }
@@ -524,12 +523,6 @@ def compress_weights(
                f"Torch backend does not support {', '.join(unsupported_options)} option(s). Set them to None."
            )

-        if sensitivity_metric not in [None, SensitivityMetric.WEIGHT_QUANTIZATION_ERROR]:
-            raise nncf.ParameterNotSupportedError(
-                "Torch backend only supports data-free sensitivity metric. "
-                "Set None or SensitivityMetric.WEIGHT_QUANTIZATION_ERROR."
-            )
-
        if advanced_parameters and advanced_parameters.statistics_path:
            raise nncf.ParameterNotSupportedError("Torch does not support statistics caching.")

@@ -546,7 +539,6 @@ def compress_weights(
        else:
            example_input = next(iter(dataset.get_inference_data()))
            model = wrap_model(model, example_input=example_input, trace_parameters=True)
-        dataset = None
        compression_weights_impl = pt_compression_weights_impl

    if backend == BackendType.TORCH_FX:

From 67cef71d22b3e06148b65a8de91374d646a16975 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Tue, 7 Jan 2025 13:33:53 +0100
Subject: [PATCH 02/46] upd sample

---
 examples/llm_compression/torch/main.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/llm_compression/torch/main.py b/examples/llm_compression/torch/main.py
index 88449b42281..0a213dadcbd 100644
--- a/examples/llm_compression/torch/main.py
+++ b/examples/llm_compression/torch/main.py
@@ -19,12 +19,12 @@

 import nncf

-MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"
+MODEL_ID = "PY007/TinyLlama-1.1B-Chat-v0.3"
 OUTPUT_DIR = "tinyllama_compressed"
 device = "cuda" if torch.cuda.is_available() else "cpu"


-def quantize(model, dataset):
+def quantize(model, tokenizer, dataset):
     quantization_dataset = nncf.Dataset(dataset)
     compressed_model = nncf.compress_weights(
         model,
@@ -51,23 +51,23 @@ def main():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, load_in_8bit=False).to(device)
     model.eval()

     dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
-    dataset = dataset.filter(lambda example: len(example["text"]) > 128)  # TODO: check whether this filter is necessary
+    # dataset = dataset.filter(lambda example: len(example["text"]) > 128)  # disabled: filtering degrades the validation metric
     # NOTE: the 128-character threshold is a heuristic to skip near-empty wikitext entries.

     def transform_fn(data):
-        tokenized_text = tokenizer(data["text"], return_tensors="pt")
+        tokenized_text = tokenizer(data["text"], return_tensors="pt").to(device)
         return tokenized_text.data  # return a dict, one of the input formats the NNCF engine accepts

     dataset = dataset.map(transform_fn).with_format("torch", device=device)

-    quantize(model, dataset)
-    compressed_model = OVModelForCausalLM.from_pretrained(
+    quantize(model, tokenizer, dataset)
+    model = OVModelForCausalLM.from_pretrained(
         OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"}
     )
-    validate(tokenizer, compressed_model)
+    validate(tokenizer, model)


 if __name__ == "__main__":

From 94d28503e037f570d5311a805633853a16dbc5a9 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Tue, 7 Jan 2025 13:42:03 +0100
Subject: [PATCH 03/46] fix reducers

---
 .../common/tensor_statistics/collectors.py   | 21 +++++++++++++------
 .../weight_compression/torch_backend.py      |  6 +++---
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/nncf/experimental/common/tensor_statistics/collectors.py b/nncf/experimental/common/tensor_statistics/collectors.py
index ce138709d71..dd4e15a114f 100644
--- a/nncf/experimental/common/tensor_statistics/collectors.py
+++ b/nncf/experimental/common/tensor_statistics/collectors.py
@@ -464,18 +464,27 @@ def _reduce_out_of_place(self, x: List[Tensor]) -> List[Tensor]:


 class MeanVarianceReducer(TensorReducerBase):
-    def _reduce_out_of_place(self, x: List[TensorType]) -> List[TensorType]:
-        raise NotImplementedError()
+    def _reduce_out_of_place(self, x: List[Tensor]) -> List[Tensor]:
+        x = x[0]
+        reduction_axes = self._get_reduction_axes(x)
+        variance = fns.var(x, reduction_axes)
+        return [fns.mean(variance)]


 class MaxVarianceReducer(TensorReducerBase):
-    def _reduce_out_of_place(self, x: List[TensorType]) -> List[TensorType]:
-        raise NotImplementedError()
+    def _reduce_out_of_place(self, x: List[Tensor]) -> List[Tensor]:
+        x = x[0]
+        reduction_axes = self._get_reduction_axes(x)
+        variance = fns.var(x, reduction_axes)
+        return [fns.max(variance)]


 class MeanAbsMaxReducer(TensorReducerBase):
-    def _reduce_out_of_place(self, x: List[TensorType]) -> List[TensorType]:
-        raise NotImplementedError()
+    def _reduce_out_of_place(self, x: List[Tensor]) -> List[Tensor]:
+        x = fns.abs(x[0])
+        reduction_axes = self._get_reduction_axes(x)
+        abs_max = fns.max(x, reduction_axes, keepdims=self._keepdims)
+        return [fns.mean(abs_max)]


 class QuantileReducerBase(TensorReducerBase):
diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py
index 3c5914a41cb..7ade18cc8c9 100644
--- a/nncf/quantization/algorithms/weight_compression/torch_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py
@@ -306,7 +306,7 @@ class PTMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, PTWeightCompression
     def mean_variance_statistic_collector(
         reduction_axes: Tuple[int], subset_size: Optional[int] = None
     ) -> TensorCollector:
-        reducer = MeanVarianceReducer(reduction_axes, inplace=True)
+        reducer = MeanVarianceReducer(reduction_axes)
         aggregator = MeanAggregator(num_samples=subset_size)
         collector = TensorCollector(MeanVarianceTensorStatistic)
         collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator)
         return collector
@@ -316,7 +316,7 @@ def 
mean_variance_statistic_collector( def max_variance_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MaxVarianceReducer(reduction_axes, inplace=True) + reducer = MaxVarianceReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MaxVarianceTensorStatistic) collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) @@ -326,7 +326,7 @@ def max_variance_statistic_collector( def mean_abs_max_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MeanAbsMaxReducer(reduction_axes, inplace=True) + reducer = MeanAbsMaxReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) From db42165c7138f89f0ab9ad7c0f4253abf319330f Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 7 Jan 2025 15:50:05 +0100 Subject: [PATCH 04/46] align SE with GPTQ --- .../weight_compression/algorithm.py | 27 +++++----- .../weight_compression/scale_estimation.py | 54 ++++++++----------- 2 files changed, 36 insertions(+), 45 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 14d0c800d32..8d8908bbe5b 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -266,6 +266,14 @@ def __init__( subset_size=gptq_params.subset_size, scale_estimation=self._scale_estimation, ) + if self._scale_estimation: + scale_estimation_params = self._advanced_parameters.scale_estimation_params + self._scale_estimation_algo = ScaleEstimation( + scale_estimation_params.subset_size, + scale_estimation_params.initial_steps, + scale_estimation_params.scale_steps, + scale_estimation_params.weight_penalty, + ) self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -616,18 +624,13 @@ def apply( ) else: if self._scale_estimation: - scale_estimation_params = self._advanced_parameters.scale_estimation_params - scales, zero_points = ScaleEstimation( - model, - self._backend_entity.name_to_node_mapping, - all_weight_params, - nodes_to_compress, - statistics, - scale_estimation_params.subset_size, - scale_estimation_params.initial_steps, - scale_estimation_params.scale_steps, - scale_estimation_params.weight_penalty, - ).apply(model, graph) + scales, zero_points = self._scale_estimation_algo.apply( + model=model, + graph=graph, + all_weight_params=all_weight_params, + statistics=statistics, + backend_entity=self._backend_entity, + ) if self._lora_correction: lora_correction_params = self._advanced_parameters.lora_correction_params diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 90ff789a429..f9d19c632e8 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -10,12 +10,11 @@ # limitations under the License. 
 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Tuple, TypeVar
+from typing import Dict, List, Optional, Tuple, TypeVar

 import nncf
 from nncf import Dataset
 from nncf.common.graph.graph import NNCFGraph
-from nncf.common.graph.graph import NNCFNode
 from nncf.common.logging.track_progress import track
 from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer
 from nncf.common.utils.backend import BackendType
@@ -48,22 +47,12 @@ class ScaleEstimation:

     def __init__(
         self,
-        model: TModel,
-        name_to_node_mapping: Dict[str, Any],
-        all_weight_params: List[WeightCompressionParameters],
-        nodes_to_compress: List[NNCFNode],
-        statistics: Dict[str, WCTensorStatistic],
         subset_size: int = 32,
         initial_steps: int = 5,
         scale_steps: int = 10,
         weight_penalty: float = -1.0,
     ):
         """
-        :param model: Model for applying algorithm.
-        :param name_to_node_mapping: Name to node mapping for updating node weights.
-        :param all_weight_params: List of all weight parameters.
-        :param nodes_to_compress: List of nodes for processing.
-        :param statistics: Input activation statistics for each node.
         :param subset_size: The number of samples for scale estimation.
         :param initial_steps: The number of the steps for absmax scale rectification.
         :param scale_steps: The number of the steps for grid search scale rectification.
         :param weight_penalty: coefficient for penalty between fp and compressed weights. If -1 then doesn't apply.
         """
         super().__init__()
-        self.name_to_node_mapping = name_to_node_mapping
-        self._all_weight_params = all_weight_params
-        self._nodes_to_compress = nodes_to_compress
-        self._statistics = statistics
         self._subset_size = subset_size
         self._initial_steps = initial_steps
         self._scale_steps = scale_steps
         self._weight_penalty = weight_penalty

-        self._set_backend_entity(model)
-
     @property
     def available_backends(self) -> List[BackendType]:
         return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX]

     def _set_backend_entity(self, model: TModel) -> None:
         """
         Creates a helper class with a backend-specific logic of the algorithm.

         :param model: Backend-specific input model.
-        :param all_weight_params: List of all weight parameters.
-        :param nodes_to_compress: List of nodes for processing.
-        :param activations: The input activations of the layers considered for compression.
""" - model_backend = get_backend(model) if model_backend == BackendType.OPENVINO: from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend - self._backend_entity = OVWeightCompressionAlgoBackend(model, self.name_to_node_mapping) - if model_backend == BackendType.TORCH: + self._backend_entity = OVWeightCompressionAlgoBackend(model) + elif model_backend == BackendType.TORCH: from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend - self._backend_entity = PTWeightCompressionAlgoBackend(model, self.name_to_node_mapping) - if model_backend == BackendType.TORCH_FX: + self._backend_entity = PTWeightCompressionAlgoBackend() + elif model_backend == BackendType.TORCH_FX: from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend - self._backend_entity = FXWeightCompressionAlgoBackend(model, self.name_to_node_mapping) + self._backend_entity = FXWeightCompressionAlgoBackend() else: raise nncf.UnsupportedBackendError( - "Cannot return backend-specific AWQ entity because {} is not supported!".format(model_backend.value) + "Cannot return backend-specific Scale Estimation entity because {} is not supported!".format( + model_backend.value + ) ) def apply( self, model: TModel, graph: NNCFGraph, + all_weight_params: List[WeightCompressionParameters], + statistics: Dict[str, WCTensorStatistic], statistic_points: Optional[StatisticPointsContainer] = None, dataset: Optional[Dataset] = None, + backend_entity: Optional[WeightCompressionAlgoBackend] = None, ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: """ Estimates better scale for the int4 nodes in the model. @@ -127,26 +111,30 @@ def apply( MatMul with compressed weights. The algorithm computes weighted scale for the group of weights in MatMul, which shared the same scale. - + :param all_weight_params: List of all weight parameters. + :param statistics: Input activation statistics for each node. :param model: Model for applying algorithm. :param graph: Model graph. :param statistic_points: Statistic points with collected statistics values. :param dataset: A representative dataset for the calibration process. + :param backend_entity: Weight compression algorithm backend. :return: Two dictionaries for estimated scales and zero points for each weight name. 
""" - + self._backend_entity = backend_entity + if self._backend_entity is None: + self._set_backend_entity(model) scales, zero_points = dict(), dict() - for wp in track(self._all_weight_params, description="Applying Scale Estimation"): + for wp in track(all_weight_params, description="Applying Scale Estimation"): weight_name = wp.weight_name node_name = wp.node_with_weight.node_name config = wp.compression_config - if config.num_bits != 4 or node_name not in self._statistics: + if config.num_bits != 4 or node_name not in statistics: scales[weight_name] = None continue - stats = self._statistics[node_name] + stats = statistics[node_name] weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm From f96788ae145af57ace91e804a373cf3beb490d1f Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 10:46:32 +0100 Subject: [PATCH 05/46] add tests --- .../test_reducers_and_aggregators.py | 45 +++++++++++++++++++ .../test_reducers_and_aggregators.py | 5 ++- .../ptq/test_weights_compression_backends.py | 38 ++++++++++++++++ 3 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 tests/torch/ptq/test_weights_compression_backends.py diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index a693efed0ac..db4b646caca 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -24,8 +24,11 @@ from nncf.experimental.common.tensor_statistics.collectors import AggregationAxes from nncf.experimental.common.tensor_statistics.collectors import HAWQAggregator from nncf.experimental.common.tensor_statistics.collectors import MaxAggregator +from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer +from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator from nncf.experimental.common.tensor_statistics.collectors import MeanNoOutliersAggregator +from nncf.experimental.common.tensor_statistics.collectors import MeanVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import MedianAbsoluteDeviationAggregator from nncf.experimental.common.tensor_statistics.collectors import MedianAggregator from nncf.experimental.common.tensor_statistics.collectors import MedianNoOutliersAggregator @@ -569,3 +572,45 @@ def test_hawq_aggregator(self, inputs, reference_output): ret_val = aggregator.aggregate() assert fns.allclose(ret_val, reference_output) + + @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) + def test_mean_variance_reducer(self, axes): + reducer = MeanVarianceReducer(reduction_axes=axes) + np_data = np.random.rand(3, 10, 10, 4) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + + # Calculate expected result using numpy + variance = np.var(np_data, axis=axes) + expected_result = np.mean(variance) + + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) + + @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) + def test_mean_abs_max_reducer(self, axes): + reducer = MeanAbsMaxReducer(reduction_axes=axes) + np_data = np.random.rand(3, 10, 10, 4) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + + # Calculate expected result 
using numpy + abs_max = np.max(np.abs(np_data), axis=axes) + expected_result = np.mean(abs_max) + + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) + + @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) + def test_max_variance_reducer(self, axes): + reducer = MaxVarianceReducer(reduction_axes=axes) + np_data = np.random.rand(3, 10, 10, 4) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + + # Calculate expected result using numpy + variance = np.var(np_data, axis=axes) + expected_result = np.max(variance) + + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) diff --git a/tests/openvino/native/quantization/test_reducers_and_aggregators.py b/tests/openvino/native/quantization/test_reducers_and_aggregators.py index eb2a23eaff2..1e306829fd1 100644 --- a/tests/openvino/native/quantization/test_reducers_and_aggregators.py +++ b/tests/openvino/native/quantization/test_reducers_and_aggregators.py @@ -75,12 +75,13 @@ def squeeze_tensor(self, ref_tensor: List[Any], axes: Optional[Tuple[int]] = Non def cast_tensor(self, tensor, dtype: Dtype): return tensor + @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("reducer_cls,reduction_axes,ref_value", MIXED_PRECISION_REDUCERS_REF_VALUES) - def test_mixed_precision_reducers(self, reducer_cls, reduction_axes, ref_value): + def test_mixed_precision_reducers(self, reducer_cls, reduction_axes, ref_value, inplace): input_ = np.arange(2 * 4 * 8).reshape(2, 4, 8) input_[:, :2] *= 2 - reducer = reducer_cls(reduction_axes=reduction_axes, inplace=True) + reducer = reducer_cls(reduction_axes=reduction_axes, inplace=inplace) inplace_fn = reducer.get_inplace_fn() ov_model_input = opset.parameter(input_.shape) diff --git a/tests/torch/ptq/test_weights_compression_backends.py b/tests/torch/ptq/test_weights_compression_backends.py new file mode 100644 index 00000000000..bd66093ec19 --- /dev/null +++ b/tests/torch/ptq/test_weights_compression_backends.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from nncf.quantization.algorithms.weight_compression.mixed_precision import HAWQCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MaxVarianceCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MeanMaxCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MeanVarianceCriterion +from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend +from tests.cross_fw.test_templates.test_weights_compression_backends import TemplateTestMixedPrecisionAlgoBackend + + +class TestPTMixedPrecisionAlgoBackend(TemplateTestMixedPrecisionAlgoBackend): + def get_hawq_with_backend(self, subset_size): + hawq = HAWQCriterion(None, None, subset_size=subset_size) + hawq._backend_entity = PTMixedPrecisionAlgoBackend() + return hawq + + def get_mean_variance_with_backend(self, subset_size: int): + mean_variance = MeanVarianceCriterion(None, None, subset_size=subset_size) + mean_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return mean_variance + + def get_max_variance_with_backend(self, subset_size: int): + max_variance = MaxVarianceCriterion(None, None, subset_size=subset_size) + max_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return max_variance + + def get_mean_max_with_backend(self, subset_size: int): + mean_max_variance = MeanMaxCriterion(None, None, subset_size=subset_size) + mean_max_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return mean_max_variance From b1d4c477e85258307c302af80b20e81506dac6fa Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 10:58:49 +0100 Subject: [PATCH 06/46] backend method - get_filter_fn_for_statistics --- .../algorithms/weight_compression/backend.py | 14 +++++++++++++- .../weight_compression/mixed_precision.py | 18 ++++-------------- .../weight_compression/openvino_backend.py | 16 +++++++++++++++- .../weight_compression/torch_backend.py | 15 ++++++++++++++- .../weight_compression/torch_fx_backend.py | 15 ++++++++++++++- 5 files changed, 60 insertions(+), 18 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 004bb08baef..a4954c3ddd9 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -11,7 +11,7 @@ from abc import ABC from abc import abstractmethod -from typing import Dict, Iterable, List, Optional, Tuple, TypeVar +from typing import Callable, Dict, Iterable, List, Optional, Tuple, TypeVar from nncf.common.graph import NNCFGraph from nncf.common.graph import NNCFNode @@ -19,6 +19,7 @@ from nncf.common.graph.transformations.commands import TargetPoint from nncf.common.graph.transformations.commands import TargetType from nncf.common.tensor_statistics.collectors import TensorStatisticCollectorBase +from nncf.common.tensor_statistics.statistic_point import StatisticPoint from nncf.experimental.common.tensor_statistics.collectors import HAWQAggregator from nncf.experimental.common.tensor_statistics.collectors import RawReducer from nncf.experimental.common.tensor_statistics.collectors import TensorCollector @@ -278,3 +279,14 @@ def mean_abs_max_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: pass + + @staticmethod + @abstractmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], 
bool]:
+        """
+        Returns backend-specific callable to filter statistic containers according to its statistic point.
+
+        :param activation_port_id: Activation port id for the statistic collection target node.
+        :param algorithm_key: Current algorithm key.
+        :return: Backend-specific callable to filter statistic containers according to its statistic point.
+        """
diff --git a/nncf/quantization/algorithms/weight_compression/mixed_precision.py b/nncf/quantization/algorithms/weight_compression/mixed_precision.py
index 900b639feeb..0c6775a2c11 100644
--- a/nncf/quantization/algorithms/weight_compression/mixed_precision.py
+++ b/nncf/quantization/algorithms/weight_compression/mixed_precision.py
@@ -311,22 +311,12 @@ def _get_activation_node_and_port(self, node: NNCFNode, nncf_graph: NNCFGraph) -
     def _get_statistics_for_node(
         self, statistic_points: StatisticPointsContainer, node: NNCFNode, nncf_graph: NNCFGraph, stat_key: str
     ) -> List[Tensor]:
-        act_node, _ = self._get_activation_node_and_port(node, nncf_graph)
-
-        def input_filter_func(point):
-            # For the floating-point statistics collected in POST_LAYER style,
-            # we also need to determine the output port id.
-            # For the cases when the layer has more than one output port.
-            # NB: Torch statistic points are registered as operator post-hooks and do
-            # not carry an output port id, so the port id check is relaxed below.
-            return (
-                self._algorithm_key in point.algorithm_to_tensor_collectors
-                and point.target_point.type in [TargetType.POST_LAYER_OPERATION, TargetType.OPERATOR_POST_HOOK]
-                # and point.target_point.port_id == output_port_id
-                # TODO: move the port id check into a backend-specific filter function
-            )
-
+        act_node, act_port_id = self._get_activation_node_and_port(node, nncf_graph)
         stats = []
         for tensor_collector in statistic_points.get_algo_statistics_for_node(
             act_node.node_name,
+            self._backend_entity.get_filter_fn_for_statistics(act_port_id, self._algorithm_key),
+            self._algorithm_key,
         ):
             statistics = tensor_collector.get_statistics()
             for data in statistics.get_data().values():
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
index ec4dfab4711..7b9d04e5864 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -8,7 +8,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, Iterable, List, Optional, Tuple +from typing import Callable, Dict, Iterable, List, Optional, Tuple import openvino as ov from openvino import Type @@ -21,6 +21,7 @@ from nncf.common.graph.operator_metatypes import OperatorMetatype from nncf.common.graph.transformations.commands import TargetType from nncf.common.graph.utils import get_reduction_axes +from nncf.common.tensor_statistics.statistic_point import StatisticPoint from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator from nncf.experimental.common.tensor_statistics.collectors import TensorCollector @@ -53,6 +54,8 @@ from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType +OV_POST_LAYER_TARGET_TYPE = TargetType.POST_LAYER_OPERATION + class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): def __init__(self, model: ov.Model, name_to_node_mapping: Dict = None): @@ -432,3 +435,14 @@ def mean_abs_max_statistic_collector( collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) return collector + + @staticmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + def filter_func(point: StatisticPoint) -> bool: + return ( + algorithm_key in point.algorithm_to_tensor_collectors + and point.target_point.type == OV_POST_LAYER_TARGET_TYPE + and point.target_point.port_id == activation_port_id + ) + + return filter_func diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 7ade18cc8c9..4746d665ea5 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict, Iterable, List, Optional, Tuple +from typing import Callable, Dict, Iterable, List, Optional, Tuple import torch @@ -21,6 +21,7 @@ from nncf.common.graph.operator_metatypes import OperatorMetatype from nncf.common.graph.transformations.commands import TargetType from nncf.common.graph.transformations.layout import TransformationLayout +from nncf.common.tensor_statistics.statistic_point import StatisticPoint from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator @@ -56,6 +57,8 @@ from nncf.torch.quantization.layers import INT8AsymmetricWeightsDecompressor from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor +PT_POST_LAYER_TARGET_TYPE = TargetType.OPERATOR_POST_HOOK + class PTWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): TARGET_TYPE_TO_PT_INS_TYPE_MAP = { @@ -331,3 +334,13 @@ def mean_abs_max_statistic_collector( collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) return collector + + @staticmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + def filter_func(point: StatisticPoint) -> bool: + return ( + algorithm_key in point.algorithm_to_tensor_collectors + and point.target_point.type == PT_POST_LAYER_TARGET_TYPE + ) + + return filter_func diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 794bc4a6427..89c78b1a9f4 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Callable, Dict, Iterable, List, Optional, Tuple

 import torch
 import torch.fx
@@ -22,6 +22,7 @@
 from nncf.common.graph.operator_metatypes import OperatorMetatype
 from nncf.common.graph.transformations.commands import TargetType
 from nncf.common.graph.transformations.layout import TransformationLayout
+from nncf.common.tensor_statistics.statistic_point import StatisticPoint
 from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer
 from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer
 from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator
@@ -58,6 +59,8 @@
 from nncf.torch.quantization.layers import INT8AsymmetricWeightsDecompressor
 from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor

+PT_POST_LAYER_TARGET_TYPE = TargetType.OPERATOR_POST_HOOK
+

 class FXWeightCompressionAlgoBackend(WeightCompressionAlgoBackend):
     MATMUL_METATYPES = PTWeightCompressionAlgoBackend.MATMUL_METATYPES
@@ -291,3 +294,13 @@ def mean_abs_max_statistic_collector(
         collector = TensorCollector(MeanMagnitudeTensorStatistic)
         collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator)
         return collector
+
+    @staticmethod
+    def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]:
+        def filter_func(point: StatisticPoint) -> bool:
+            return (
+                algorithm_key in point.algorithm_to_tensor_collectors
+                and point.target_point.type == PT_POST_LAYER_TARGET_TYPE
+            )
+
+        return filter_func

From 51ccdd6c059605a7aaa872f937102d7499ad560c Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Wed, 8 Jan 2025 11:09:08 +0100
Subject: [PATCH 07/46] fixes

---
 examples/llm_compression/torch/main.py                 | 4 ++--
 .../algorithms/weight_compression/scale_estimation.py  | 5 +++--
 .../algorithms/weight_compression/torch_fx_backend.py  | 6 +++---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/examples/llm_compression/torch/main.py b/examples/llm_compression/torch/main.py
index 0a213dadcbd..a91508eef58 100644
--- a/examples/llm_compression/torch/main.py
+++ b/examples/llm_compression/torch/main.py
@@ -24,7 +24,7 @@
 device = "cuda" if torch.cuda.is_available() else "cpu"


-def quantize(model, tokenizer, dataset):
+def quantize(model, dataset):
     quantization_dataset = nncf.Dataset(dataset)
     compressed_model = nncf.compress_weights(
         model,
@@ -63,7 +63,7 @@ def transform_fn(data):

     dataset = dataset.map(transform_fn).with_format("torch", device=device)

-    quantize(model, tokenizer, dataset)
+    quantize(model, dataset)
     model = OVModelForCausalLM.from_pretrained(
         OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"}
     )
     validate(tokenizer, model)
diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index f9d19c632e8..c3fe7050e7c 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -111,10 +111,11 @@ def apply(
         MatMul with compressed weights.
         The algorithm computes weighted scale for the group of weights in MatMul, which
         share the same scale.
-        :param all_weight_params: List of all weight parameters.
-        :param statistics: Input activation statistics for each node.
+
         :param model: Model for applying algorithm.
         :param graph: Model graph.
+ :param all_weight_params: List of all weight parameters. + :param statistics: Input activation statistics for each node. :param statistic_points: Statistic points with collected statistics values. :param dataset: A representative dataset for the calibration process. :param backend_entity: Weight compression algorithm backend. diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 89c78b1a9f4..9764327b5d6 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -269,7 +269,7 @@ class FXMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, FXWeightCompression def mean_variance_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MeanVarianceReducer(reduction_axes, inplace=True) + reducer = MeanVarianceReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MeanVarianceTensorStatistic) collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator) @@ -279,7 +279,7 @@ def mean_variance_statistic_collector( def max_variance_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MaxVarianceReducer(reduction_axes, inplace=True) + reducer = MaxVarianceReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MaxVarianceTensorStatistic) collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) @@ -289,7 +289,7 @@ def max_variance_statistic_collector( def mean_abs_max_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: - reducer = MeanAbsMaxReducer(reduction_axes, inplace=True) + reducer = MeanAbsMaxReducer(reduction_axes) aggregator = MeanAggregator(num_samples=subset_size) collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) From e37ef52c9aebd9aec7da6394428c1bea6d6d76db Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 11:35:36 +0100 Subject: [PATCH 08/46] sample --- examples/llm_compression/torch/{ => llama_3_2}/main.py | 4 ++-- examples/llm_compression/torch/llama_3_2/requirements.txt | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) rename examples/llm_compression/torch/{ => llama_3_2}/main.py (97%) create mode 100644 examples/llm_compression/torch/llama_3_2/requirements.txt diff --git a/examples/llm_compression/torch/main.py b/examples/llm_compression/torch/llama_3_2/main.py similarity index 97% rename from examples/llm_compression/torch/main.py rename to examples/llm_compression/torch/llama_3_2/main.py index a91508eef58..ff03296a47e 100644 --- a/examples/llm_compression/torch/main.py +++ b/examples/llm_compression/torch/llama_3_2/main.py @@ -19,8 +19,8 @@ import nncf -MODEL_ID = "PY007/TinyLlama-1.1B-Chat-v0.3" -OUTPUT_DIR = "tinyllama_compressed" +MODEL_ID = "unsloth/Llama-3.2-1B" +OUTPUT_DIR = "compressed" device = "cuda" if torch.cuda.is_available() else "cpu" diff --git a/examples/llm_compression/torch/llama_3_2/requirements.txt b/examples/llm_compression/torch/llama_3_2/requirements.txt new file mode 100644 index 00000000000..e29c588e595 --- /dev/null +++ 
b/examples/llm_compression/torch/llama_3_2/requirements.txt @@ -0,0 +1,4 @@ +transformers +datasets==3.2 +openvino==2024.6 +optimum-intel[openvino] From d1843ad71a6238eaf5d57fee57379d18822c58ae Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:15:45 +0100 Subject: [PATCH 09/46] add tinyllama_data_aware, tinyllama_scale_estimation_per_channel for torch --- tests/post_training/data/wc_reference_data.yaml | 8 ++++++++ tests/post_training/model_scope.py | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 6c48904c91a..3d27d81ee20 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -6,6 +6,10 @@ tinyllama_data_aware_backend_OV: metric_value: 0.85767 num_int4: 94 num_int8: 124 +tinyllama_data_aware_backend_TORCH: + metric_value: 0.85767 + num_int4: 94 + num_int8: 124 tinyllama_data_aware_awq_stateful_backend_OV: metric_value: 0.85616 num_int4: 94 @@ -31,6 +35,10 @@ tinyllama_scale_estimation_per_channel_backend_OV: metric_value: 0.81389 num_int4: 188 num_int8: 124 +tinyllama_scale_estimation_per_channel_backend_TORCH: + metric_value: 0.81389 + num_int4: 188 + num_int8: 124 tinyllama_data_aware_lora_stateful_backend_OV: metric_value: 0.83446 num_int4: 94 diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index beea18e48fc..ad2dd36757d 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -389,7 +389,7 @@ "model_id": "tinyllama/tinyllama-1.1b-step-50k-105b", "pipeline_cls": LMWeightCompression, "compression_params": {"group_size": 64, "ratio": 0.8, "mode": CompressWeightsMode.INT4_SYM}, - "backends": [BackendType.OV], + "backends": [BackendType.OV, BackendType.TORCH], }, { "reported_name": "tinyllama_data_aware_awq_stateful", @@ -496,7 +496,7 @@ "mode": CompressWeightsMode.INT4_ASYM, "scale_estimation": True, }, - "backends": [BackendType.OV], + "backends": [BackendType.OV, BackendType.TORCH], }, { "reported_name": "tinyllama_data_aware_lora_stateful", From cd79e80c3affc8342cb6d7689b4442b834a618a9 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:17:54 +0100 Subject: [PATCH 10/46] fix precommit --- tests/torch/experimental/sparsify_activations/test_algo.py | 2 -- .../experimental/sparsify_activations/test_components.py | 6 +----- tests/torch/ptq/test_weights_compression.py | 2 -- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/torch/experimental/sparsify_activations/test_algo.py b/tests/torch/experimental/sparsify_activations/test_algo.py index c5616aa8372..64d245d3639 100644 --- a/tests/torch/experimental/sparsify_activations/test_algo.py +++ b/tests/torch/experimental/sparsify_activations/test_algo.py @@ -128,7 +128,6 @@ class SparsifyActivationsAlgorithmTestDesc: @pytest.mark.parametrize("compress_weights", [False, True], scope="class") @pytest.mark.parametrize("use_cuda", [False, True], ids=["cpu", "cuda"], scope="class") class TestSparsifyActivationsAlgorithm: - @pytest.fixture(autouse=True, scope="class") def setup(self, request, desc: SparsifyActivationsAlgorithmTestDesc, compress_weights: bool, use_cuda: bool): if use_cuda and not torch.cuda.is_available(): @@ -146,7 +145,6 @@ def setup(self, request, desc: SparsifyActivationsAlgorithmTestDesc, compress_we model = nncf.compress_weights( model, mode=nncf.CompressWeightsMode.INT8_SYM, - dataset=dataset, ) model = 
nncf.experimental.torch.sparsify_activations.sparsify_activations( model=model, diff --git a/tests/torch/experimental/sparsify_activations/test_components.py b/tests/torch/experimental/sparsify_activations/test_components.py index f74ceab3c1b..938a4facf1b 100644 --- a/tests/torch/experimental/sparsify_activations/test_components.py +++ b/tests/torch/experimental/sparsify_activations/test_components.py @@ -217,11 +217,7 @@ def create_model_and_dataset(self, compress_weights: bool = False): model = ThreeLinearModel() dataset = nncf.Dataset(torch.randint(0, 30, (3, 2, 8))) if compress_weights: - model = nncf.compress_weights( - model, - mode=nncf.CompressWeightsMode.INT8_SYM, - dataset=dataset, - ) + model = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8_SYM) else: model = wrap_model( model, diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 12f113863f7..83e211a5bd2 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -236,10 +236,8 @@ def test_raise_error_with_unsupported_params_for_int8(mode, params): @pytest.mark.parametrize( "params", ( - *({"sensitivity_metric": metric} for metric in DATA_BASED_SENSITIVITY_METRICS), {"gptq": True}, {"awq": True}, - {"scale_estimation": True}, {"lora_correction": True}, ), ) From df6b43ba0cf9d75158dd63d31c6dee1f84173d49 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:30:21 +0100 Subject: [PATCH 11/46] minor --- tests/torch/ptq/test_weights_compression.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 83e211a5bd2..091e9192fe4 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -30,14 +30,7 @@ from nncf.torch.quantization.quantize_functions import unpack_uint4 from tests.torch.test_models.synthetic import ShortTransformer -DATA_BASED_SENSITIVITY_METRICS = ( - SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - SensitivityMetric.MEAN_ACTIVATION_VARIANCE, - SensitivityMetric.MAX_ACTIVATION_VARIANCE, - SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, -) - -ALL_SENSITIVITY_METRICS = DATA_BASED_SENSITIVITY_METRICS + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR,) +ALL_SENSITIVITY_METRICS = all_sensitivity_metrics = list(SensitivityMetric) INT8_MODES = (CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM) INT4_MODES = (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM) From 368054a28985093b141158eea83c53554cc9c15e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:47:45 +0100 Subject: [PATCH 12/46] refactor test --- .../test_reducers_and_aggregators.py | 90 ++++++++++--------- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index 5b6c2f45339..5f83bf64875 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -235,6 +235,54 @@ def test_quantile_reducers(self, reducer_name, ref, reducers): for i, ref_ in enumerate(ref): assert fns.allclose(val[i], self.get_nncf_tensor(ref_)) + @pytest.mark.parametrize( + "axes, np_data, reference", + [ + [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, 
-6]]]), 14.25], + [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 15.875], + [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + ], + ) + def test_mean_variance_reducer(self, axes, np_data, reference): + reducer = MeanVarianceReducer(reduction_axes=axes) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(reference)) + + @pytest.mark.parametrize( + "axes, np_data, reference", + [ + [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 10.0], + [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 4.16666], + [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 6.33333], + [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 10.0], + ], + ) + def test_mean_abs_max_reducer(self, axes, np_data, reference): + reducer = MeanAbsMaxReducer(reduction_axes=axes) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(reference)) + + @pytest.mark.parametrize( + "axes, np_data, reference", + [ + [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 64.0], + [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 36.1875], + [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + ], + ) + def test_max_variance_reducer(self, axes, np_data, reference): + reducer = MaxVarianceReducer(reduction_axes=axes) + nncf_data = self.get_nncf_tensor(np_data) + result = reducer._reduce_out_of_place([nncf_data]) + assert len(result) == 1 + assert fns.allclose(result[0], self.get_nncf_tensor(reference)) + @pytest.mark.parametrize( "reducer_name,ref,kwargs", [ @@ -572,45 +620,3 @@ def test_hawq_aggregator(self, inputs, reference_output): ret_val = aggregator.aggregate() assert fns.allclose(ret_val, reference_output) - - @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) - def test_mean_variance_reducer(self, axes): - reducer = MeanVarianceReducer(reduction_axes=axes) - np_data = np.random.rand(3, 10, 10, 4) - nncf_data = self.get_nncf_tensor(np_data) - result = reducer._reduce_out_of_place([nncf_data]) - - # Calculate expected result using numpy - variance = np.var(np_data, axis=axes) - expected_result = np.mean(variance) - - assert len(result) == 1 - assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) - - @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) - def test_mean_abs_max_reducer(self, axes): - reducer = MeanAbsMaxReducer(reduction_axes=axes) - np_data = np.random.rand(3, 10, 10, 4) - nncf_data = self.get_nncf_tensor(np_data) - result = reducer._reduce_out_of_place([nncf_data]) - - # Calculate expected result using numpy - abs_max = np.max(np.abs(np_data), axis=axes) - expected_result = np.mean(abs_max) - - assert len(result) == 1 - assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) - - @pytest.mark.parametrize("axes", [None, (0,), (0, 1), (0, 1, 2)]) - def test_max_variance_reducer(self, axes): - reducer = MaxVarianceReducer(reduction_axes=axes) - np_data = np.random.rand(3, 10, 10, 4) - nncf_data = self.get_nncf_tensor(np_data) - result = reducer._reduce_out_of_place([nncf_data]) - - # Calculate expected result 
using numpy - variance = np.var(np_data, axis=axes) - expected_result = np.max(variance) - - assert len(result) == 1 - assert fns.allclose(result[0], self.get_nncf_tensor(expected_result)) From e2a6f46fc6d2c221b6f019ceaa1e6de12b6e78e3 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 12:54:35 +0100 Subject: [PATCH 13/46] add WA for dataset --- nncf/quantization/quantize_model.py | 2 ++ .../torch/experimental/sparsify_activations/test_components.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 7e99a7e71a4..64ba790c264 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -539,6 +539,8 @@ def compress_weights( else: example_input = next(iter(dataset.get_inference_data())) model = wrap_model(model, example_input=example_input, trace_parameters=True) + if mode in (CompressWeightsMode.INT8, CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM): + dataset = None # workaround as INT8 mode still does not support dataset compression_weights_impl = pt_compression_weights_impl if backend == BackendType.TORCH_FX: diff --git a/tests/torch/experimental/sparsify_activations/test_components.py b/tests/torch/experimental/sparsify_activations/test_components.py index 938a4facf1b..09ed2adc740 100644 --- a/tests/torch/experimental/sparsify_activations/test_components.py +++ b/tests/torch/experimental/sparsify_activations/test_components.py @@ -217,7 +217,7 @@ def create_model_and_dataset(self, compress_weights: bool = False): model = ThreeLinearModel() dataset = nncf.Dataset(torch.randint(0, 30, (3, 2, 8))) if compress_weights: - model = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8_SYM) + model = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8_SYM, dataset=dataset) else: model = wrap_model( model, From dbf2b1df017374c925bd10416e6c1d522572737b Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 13:07:41 +0100 Subject: [PATCH 14/46] fix --- tests/torch/experimental/sparsify_activations/test_algo.py | 2 ++ tests/torch/ptq/test_weights_compression.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/torch/experimental/sparsify_activations/test_algo.py b/tests/torch/experimental/sparsify_activations/test_algo.py index 64d245d3639..c5616aa8372 100644 --- a/tests/torch/experimental/sparsify_activations/test_algo.py +++ b/tests/torch/experimental/sparsify_activations/test_algo.py @@ -128,6 +128,7 @@ class SparsifyActivationsAlgorithmTestDesc: @pytest.mark.parametrize("compress_weights", [False, True], scope="class") @pytest.mark.parametrize("use_cuda", [False, True], ids=["cpu", "cuda"], scope="class") class TestSparsifyActivationsAlgorithm: + @pytest.fixture(autouse=True, scope="class") def setup(self, request, desc: SparsifyActivationsAlgorithmTestDesc, compress_weights: bool, use_cuda: bool): if use_cuda and not torch.cuda.is_available(): @@ -145,6 +146,7 @@ def setup(self, request, desc: SparsifyActivationsAlgorithmTestDesc, compress_we model = nncf.compress_weights( model, mode=nncf.CompressWeightsMode.INT8_SYM, + dataset=dataset, ) model = nncf.experimental.torch.sparsify_activations.sparsify_activations( model=model, diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 091e9192fe4..979326cfb36 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -30,7 +30,7 
@@ from nncf.torch.quantization.quantize_functions import unpack_uint4 from tests.torch.test_models.synthetic import ShortTransformer -ALL_SENSITIVITY_METRICS = all_sensitivity_metrics = list(SensitivityMetric) +ALL_SENSITIVITY_METRICS = list(SensitivityMetric) INT8_MODES = (CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM) INT4_MODES = (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM) From 702f8b10b09226c6dea892b4de21309ae575f88f Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 13:27:51 +0100 Subject: [PATCH 15/46] dtype --- tests/common/experimental/test_reducers_and_aggregators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index 5f83bf64875..7d60f0fc01a 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -246,7 +246,7 @@ def test_quantile_reducers(self, reducer_name, ref, reducers): ) def test_mean_variance_reducer(self, axes, np_data, reference): reducer = MeanVarianceReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data) + nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) @@ -262,7 +262,7 @@ def test_mean_variance_reducer(self, axes, np_data, reference): ) def test_mean_abs_max_reducer(self, axes, np_data, reference): reducer = MeanAbsMaxReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data) + nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) @@ -278,7 +278,7 @@ def test_mean_abs_max_reducer(self, axes, np_data, reference): ) def test_max_variance_reducer(self, axes, np_data, reference): reducer = MaxVarianceReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data) + nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) From 24e39c23379571f8ad5640a6a192ee9e8d96f260 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 8 Jan 2025 13:28:54 +0100 Subject: [PATCH 16/46] polishing --- .../algorithms/weight_compression/openvino_backend.py | 6 ++---- .../algorithms/weight_compression/torch_backend.py | 5 ++--- .../algorithms/weight_compression/torch_fx_backend.py | 5 ++--- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index cda28b0c9d3..90005c09028 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -54,8 +54,6 @@ from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType -OV_POST_LAYER_TARGET_TYPE = TargetType.POST_LAYER_OPERATION - class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): def __init__(self, model: ov.Model, name_to_node_mapping: Dict = None): @@ -108,7 +106,7 @@ def mean_statistic_collector( @staticmethod def get_activation_port_id(node: NNCFNode, nncf_graph: NNCFGraph) -> int: - if 
node.layer_attributes.input_attributes["transpose"]: # It works only for OV + if node.layer_attributes.input_attributes["transpose"]: raise nncf.UnsupportedModelError("Transposed input is not supported") constant_ports = node.layer_attributes.get_const_port_ids() activation_ports = [ @@ -441,7 +439,7 @@ def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> def filter_func(point: StatisticPoint) -> bool: return ( algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type == OV_POST_LAYER_TARGET_TYPE + and point.target_point.type == TargetType.POST_LAYER_OPERATION and point.target_point.port_id == activation_port_id ) diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index acf8e081bed..9ccf63977ff 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -57,8 +57,6 @@ from nncf.torch.quantization.layers import INT8AsymmetricWeightsDecompressor from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -PT_POST_LAYER_TARGET_TYPE = TargetType.OPERATOR_POST_HOOK - class PTWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): TARGET_TYPE_TO_PT_INS_TYPE_MAP = { @@ -340,7 +338,8 @@ def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> def filter_func(point: StatisticPoint) -> bool: return ( algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type == PT_POST_LAYER_TARGET_TYPE + and point.target_point.type + == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] ) return filter_func diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 032d77983e1..449957d3ecb 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -59,8 +59,6 @@ from nncf.torch.quantization.layers import INT8AsymmetricWeightsDecompressor from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -PT_POST_LAYER_TARGET_TYPE = TargetType.OPERATOR_POST_HOOK - class FXWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): MATMUL_METATYPES = PTWeightCompressionAlgoBackend.MATMUL_METATYPES @@ -300,7 +298,8 @@ def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> def filter_func(point: StatisticPoint) -> bool: return ( algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type == PT_POST_LAYER_TARGET_TYPE + and point.target_point.type + == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] ) return filter_func From e97078b71fc6f9cb7cc2f08fe94a3c4071ef6030 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 10:28:34 +0100 Subject: [PATCH 17/46] updates for torch --- .../weight_compression/algorithm.py | 15 +-- .../algorithms/weight_compression/backend.py | 22 ++-- .../weight_compression/openvino_backend.py | 23 ++-- .../weight_compression/scale_estimation.py | 9 +- .../weight_compression/torch_backend.py | 105 ++++++++++++++++-- nncf/torch/engine.py | 11 +- nncf/torch/quantization/layers.py | 4 +- 7 files changed, 133 insertions(+), 56 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py 
b/nncf/quantization/algorithms/weight_compression/algorithm.py index f85bd74a8e4..7973712da9a 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -12,7 +12,6 @@ import operator from collections import OrderedDict from collections import defaultdict -from functools import partial from functools import reduce from typing import Any, Dict, Iterable, List, Optional, Tuple, TypeVar @@ -812,16 +811,6 @@ def _get_statistics_for_weights_compression( :return: Collected statistics. """ - def input_filter_func(point, port_id): - # For the floating-point statistics collected in POST_LAYER style, - # we also need to determine the output port id. - # For the cases when the layer has more than one (0) output port. - return ( - self._algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type == TargetType.POST_LAYER_OPERATION - and point.target_point.port_id == port_id - ) - # For each node we store statistics in a WCTensorStatistics data-class. It contains the following fields: # mean_values=[mean_value_1, ..., mean_value_n] # shapes=[shape_1, ..., shape_n] @@ -831,7 +820,9 @@ def input_filter_func(point, port_id): for (act_node, output_port_id), matmul_nodes in matmul_input_to_output_nodes_map.items(): tensor_collectors = list( statistic_points.get_algo_statistics_for_node( - act_node.node_name, partial(input_filter_func, port_id=output_port_id), self._algorithm_key + act_node.node_name, + self._backend_entity.get_filter_fn_for_statistics(output_port_id, self._algorithm_key), + self._algorithm_key, ) ) # Statistics could be empty in case when the statistics is registered for another algorithm, diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 9f1f97ef928..c8ea964a288 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -235,6 +235,17 @@ def dump_parameters( :param path: Optional list of the paths. """ + @staticmethod + @abstractmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + """ + Returns backend-specific callable to filter statistic containers according to its statistic point. + + :param activation_port_id: Activation port id for the statistic collection target node. + :param algorithm_key: Current algorithm key. + :return: Backend-specific callable to filter statistic containers according to its statistic point. + """ + class AWQAlgoBackend(WeightCompressionAlgoBackend): @staticmethod @@ -279,14 +290,3 @@ def mean_abs_max_statistic_collector( reduction_axes: Tuple[int], subset_size: Optional[int] = None ) -> TensorCollector: pass - - @staticmethod - @abstractmethod - def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: - """ - Returns backend-specific callable to filter statistic containers according to its statistic point. - - :param activation_port_id: Activation port id for the statistic collection target node. - :param algorithm_key: Current algorithm key. - :return: Backend-specific callable to filter statistic containers according to its statistic point. 
-        """
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
index 90005c09028..53cf2c6a03f 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -374,6 +374,7 @@ def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p
         s = opset.parameter(s_shape, name="s")
         parameters = [w, s]
         compressed_w = w / s
+        compressed_w.get_rt_info()["nonconvertable_divide_0"] = True
         if z_p_shape is not None:
             zp = opset.parameter(z_p_shape, name="zp")
             parameters.append(zp)
@@ -390,6 +391,17 @@ def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p
 
         return lambda parameters: compiled_model(parameters)[0]
 
+    @staticmethod
+    def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]:
+        def filter_func(point: StatisticPoint) -> bool:
+            return (
+                algorithm_key in point.algorithm_to_tensor_collectors
+                and point.target_point.type == TargetType.POST_LAYER_OPERATION
+                and point.target_point.port_id == activation_port_id
+            )
+
+        return filter_func
+
 
 class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend):
     @staticmethod
@@ -433,14 +445,3 @@ def mean_abs_max_statistic_collector(
         collector = TensorCollector(MeanMagnitudeTensorStatistic)
         collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator)
         return collector
-
-    @staticmethod
-    def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]:
-        def filter_func(point: StatisticPoint) -> bool:
-            return (
-                algorithm_key in point.algorithm_to_tensor_collectors
-                and point.target_point.type == TargetType.POST_LAYER_OPERATION
-                and point.target_point.port_id == activation_port_id
-            )
-
-        return filter_func
diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index 6fae3ed905f..b120092f6ca 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -237,7 +237,7 @@ def calculate_quantization_params(
 
         # normalize importances for every group of weights to make sum of them equal to 1.0
         denum = fns.sum(importance, axis=2, keepdims=True)
-        importance = importance / (denum + eps)
+        importance = importance / (denum + eps)  # for each weight in a group
 
         X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size)
         q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size)
@@ -274,11 +274,11 @@ def calculate_quantization_params(
         zero_scale = 0.001
         zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
 
-        input_tensors = [original_weight.data, None]
+        input_tensors = [original_weight.data, None, None]
         if zp is not None:
-            input_tensors.append(zp.data)
+            input_tensors[2] = zp.data
         # iterative rectification of initial scale
-        for i in range(initial_steps):
+        for i in range(initial_steps):  # make several iterations of updating the scale
             near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
             near_to_ideal_scale = near_to_ideal_scale * scale_sign
             input_tensors[1] = near_to_ideal_scale.data
@@ -406,6 +406,7 @@ def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None
 
 
 def estimate_scales(weight:
Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: + # ideal scale to determine the importance of the weights """ Estimates scales for the given weight, target, zero mask, and importance. diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 9ccf63977ff..09cd811f206 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -12,6 +12,7 @@ from typing import Callable, Dict, Iterable, List, Optional, Tuple import torch +import torch.nn as nn import nncf from nncf.common.graph.definitions import NNCFGraphNodeType @@ -37,6 +38,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight @@ -58,6 +60,79 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor +class CompressModule(nn.Module): + def __init__(self, level_low, level_high): + super().__init__() + self.level_low = level_low + self.level_high = level_high + + def forward(self, tensor, scale, zero_point=None): + # Compressed weights: (w / s) + optional zp + x = tensor / scale + if zero_point is not None: + x = x + zero_point + x = torch.round(x) + x = torch.clamp(x, min=self.level_low, max=self.level_high) + return x + + +class CompressDecompressModule(nn.Module): + def __init__(self, compress_mod): + super().__init__() + self.compress_mod = compress_mod + + def forward(self, tensor, scale, zero_point=None): + # Step 1: compress + clamp_out = self.compress_mod(tensor, scale, zero_point) + + # Step 2: decompress + if zero_point is not None: + out = (clamp_out - zero_point) * scale + else: + out = clamp_out * scale + return out + + +def get_compress_pipeline(mode: CompressWeightsMode, num_bits: int, use_torchscript=False): + asym_quant = mode in [CompressWeightsMode.INT4_ASYM] + level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) + level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 + + compress_module = CompressModule(level_low, level_high) + + # Optionally compile with TorchScript + if use_torchscript: + compress_module = torch.jit.script(compress_module) + + def _forward_fn(tensor, scale, zero_point): + with torch.no_grad(): + return compress_module(tensor, scale, zero_point) + + return _forward_fn + + +def get_compress_decompress_pipeline(mode, num_bits, use_torchscript=False): + compress_module = get_compress_pipeline( + mode=mode, + num_bits=num_bits, + use_torchscript=False, # We'll handle TorchScript in the final module + ) + + cdc_module = CompressDecompressModule(compress_module) + + # Optionally compile entire compress+decompress pipeline with TorchScript + if use_torchscript: + cdc_module = torch.jit.script(cdc_module) + + # Return a simple callable + def _forward_fn(parameters): + w, s, zp = parameters + with torch.no_grad(): + return cdc_module(w, s, zp) + + return _forward_fn + + class 
PTWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): TARGET_TYPE_TO_PT_INS_TYPE_MAP = { TargetType.PRE_LAYER_OPERATION: TargetType.OPERATOR_PRE_HOOK, @@ -210,6 +285,25 @@ def insert_adapters( ) -> None: pass + @staticmethod + def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): + return get_compress_decompress_pipeline(config.mode, config.num_bits, True) + + @staticmethod + def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): + return get_compress_pipeline(config.mode, config.num_bits, True) + + @staticmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + def filter_func(point: StatisticPoint) -> bool: + return ( + algorithm_key in point.algorithm_to_tensor_collectors + and point.target_point.type + == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] + ) + + return filter_func + def transform_model( self, model: NNCFNetwork, @@ -332,14 +426,3 @@ def mean_abs_max_statistic_collector( collector = TensorCollector(MeanMagnitudeTensorStatistic) collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) return collector - - @staticmethod - def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: - def filter_func(point: StatisticPoint) -> bool: - return ( - algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type - == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] - ) - - return filter_func diff --git a/nncf/torch/engine.py b/nncf/torch/engine.py index ed70e8fb3a3..27b7cc5e706 100644 --- a/nncf/torch/engine.py +++ b/nncf/torch/engine.py @@ -44,9 +44,10 @@ def infer( :param input_data: Inputs for the model. :return: Model outputs. 
""" + with torch.no_grad(): + if isinstance(input_data, dict): + return self._model(**input_data) + if isinstance(input_data, tuple): + return self._model(*input_data) - if isinstance(input_data, dict): - return self._model(**input_data) - if isinstance(input_data, tuple): - return self._model(*input_data) - return self._model(input_data) + return self._model(input_data) diff --git a/nncf/torch/quantization/layers.py b/nncf/torch/quantization/layers.py index 13d9655cb81..9bd44b7fc22 100644 --- a/nncf/torch/quantization/layers.py +++ b/nncf/torch/quantization/layers.py @@ -1094,7 +1094,7 @@ def __init__(self, scale: torch.Tensor, zero_point: torch.Tensor, result_dtype: """ super().__init__() self.register_buffer("_scale", scale.type(dtype=torch.float16)) - self.register_buffer("_zero_point", self.pack_weight(zero_point)) + self.register_buffer("_zero_point", self.pack_weight(zero_point.type(dtype=torch.uint8))) self.result_dtype = result_dtype @property @@ -1165,7 +1165,7 @@ def __init__( self.register_buffer("_scale", scale.type(dtype=torch.float16)) self.zero_point_shape = zero_point.shape - self.register_buffer("_zero_point", self.pack_weight(zero_point)) + self.register_buffer("_zero_point", self.pack_weight(zero_point.type(dtype=torch.uint8))) self.compressed_weight_shape = compressed_weight_shape self.result_shape = result_shape From 035a6688de5b1b2678ff0083d5e39c0b44d57ad3 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 11:12:20 +0100 Subject: [PATCH 18/46] add functions --- .../weight_compression/torch_backend.py | 93 ++++++++----------- 1 file changed, 38 insertions(+), 55 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 09cd811f206..19fb1c2586c 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -12,7 +12,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Tuple import torch -import torch.nn as nn import nncf from nncf.common.graph.definitions import NNCFGraphNodeType @@ -60,75 +59,59 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -class CompressModule(nn.Module): - def __init__(self, level_low, level_high): - super().__init__() - self.level_low = level_low - self.level_high = level_high +def compress( + tensor: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None, level_low: int, level_high: int +) -> torch.Tensor: + x = tensor / scale + if zero_point is not None: + x = x + zero_point + x = torch.round(x) + x = torch.clamp(x, min=level_low, max=level_high) + return x - def forward(self, tensor, scale, zero_point=None): - # Compressed weights: (w / s) + optional zp - x = tensor / scale - if zero_point is not None: - x = x + zero_point - x = torch.round(x) - x = torch.clamp(x, min=self.level_low, max=self.level_high) - return x +def decompress(compressed: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None) -> torch.Tensor: + if zero_point is not None: + return (compressed - zero_point) * scale + return compressed * scale -class CompressDecompressModule(nn.Module): - def __init__(self, compress_mod): - super().__init__() - self.compress_mod = compress_mod - def forward(self, tensor, scale, zero_point=None): - # Step 1: compress - clamp_out = self.compress_mod(tensor, scale, zero_point) +def compress_decompress( + tensor: torch.Tensor, scale: torch.Tensor, zero_point: 
torch.Tensor | None, level_low: int, level_high: int +) -> torch.Tensor: + x = compress(tensor, scale, zero_point, level_low, level_high) + # Step 2: decompress + if zero_point is not None: + x = (x - zero_point) * scale + else: + x = x * scale + return x - # Step 2: decompress - if zero_point is not None: - out = (clamp_out - zero_point) * scale - else: - out = clamp_out * scale - return out - -def get_compress_pipeline(mode: CompressWeightsMode, num_bits: int, use_torchscript=False): +def get_compress_fn(mode, num_bits: int): asym_quant = mode in [CompressWeightsMode.INT4_ASYM] level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - compress_module = CompressModule(level_low, level_high) - - # Optionally compile with TorchScript - if use_torchscript: - compress_module = torch.jit.script(compress_module) - - def _forward_fn(tensor, scale, zero_point): + def _forward_fn(inputs): + tensor, scale, zero_point = inputs with torch.no_grad(): - return compress_module(tensor, scale, zero_point) + return compress(tensor, scale, zero_point, level_low, level_high) return _forward_fn -def get_compress_decompress_pipeline(mode, num_bits, use_torchscript=False): - compress_module = get_compress_pipeline( - mode=mode, - num_bits=num_bits, - use_torchscript=False, # We'll handle TorchScript in the final module - ) - - cdc_module = CompressDecompressModule(compress_module) - - # Optionally compile entire compress+decompress pipeline with TorchScript - if use_torchscript: - cdc_module = torch.jit.script(cdc_module) +def get_compress_decompress_fn(mode, num_bits: int): + asym_quant = mode in [CompressWeightsMode.INT4_ASYM] + level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) + level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - # Return a simple callable - def _forward_fn(parameters): - w, s, zp = parameters + def _forward_fn(inputs): + tensor, scale, zero_point = inputs with torch.no_grad(): - return cdc_module(w, s, zp) + return compress_decompress( + tensor=tensor, scale=scale, zero_point=zero_point, level_low=level_low, level_high=level_high + ) return _forward_fn @@ -287,11 +270,11 @@ def insert_adapters( @staticmethod def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - return get_compress_decompress_pipeline(config.mode, config.num_bits, True) + return get_compress_decompress_fn(config.mode, config.num_bits) @staticmethod def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): - return get_compress_pipeline(config.mode, config.num_bits, True) + return get_compress_fn(config.mode, config.num_bits) @staticmethod def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: From 58b992428bb319c747869a0f760c2f6aac36740c Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 11:15:26 +0100 Subject: [PATCH 19/46] upd metrics --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 3d27d81ee20..8d47e097953 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -36,7 +36,7 @@ tinyllama_scale_estimation_per_channel_backend_OV: num_int4: 188 num_int8: 124 tinyllama_scale_estimation_per_channel_backend_TORCH: - 
metric_value: 0.81389 + metric_value: 0.80799 num_int4: 188 num_int8: 124 tinyllama_data_aware_lora_stateful_backend_OV: From be3694beebd463e780675a34db2b557d56a431aa Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 14:16:51 +0100 Subject: [PATCH 20/46] rm ov flag --- .../algorithms/weight_compression/openvino_backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 53cf2c6a03f..fe22b5fdaec 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -374,7 +374,6 @@ def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p s = opset.parameter(s_shape, name="s") parameters = [w, s] compressed_w = w / s - compressed_w.get_rt_info()["nonconvertable_divide_0"] = True if z_p_shape is not None: zp = opset.parameter(z_p_shape, name="zp") parameters.append(zp) From 9345e2fe6277b4b6ffb3e05d856d98566d464b75 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 14:17:56 +0100 Subject: [PATCH 21/46] rm example --- .../llm_compression/torch/llama_3_2/main.py | 74 ------------------- .../torch/llama_3_2/requirements.txt | 4 - 2 files changed, 78 deletions(-) delete mode 100644 examples/llm_compression/torch/llama_3_2/main.py delete mode 100644 examples/llm_compression/torch/llama_3_2/requirements.txt diff --git a/examples/llm_compression/torch/llama_3_2/main.py b/examples/llm_compression/torch/llama_3_2/main.py deleted file mode 100644 index ff03296a47e..00000000000 --- a/examples/llm_compression/torch/llama_3_2/main.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import time - -import torch -from datasets import load_dataset -from optimum.exporters.openvino.convert import export_from_model -from optimum.intel.openvino import OVModelForCausalLM -from transformers import AutoModelForCausalLM -from transformers import AutoTokenizer - -import nncf - -MODEL_ID = "unsloth/Llama-3.2-1B" -OUTPUT_DIR = "compressed" -device = "cuda" if torch.cuda.is_available() else "cpu" - - -def quantize(model, dataset): - quantization_dataset = nncf.Dataset(dataset) - compressed_model = nncf.compress_weights( - model, - dataset=quantization_dataset, - mode=nncf.CompressWeightsMode.INT4_SYM, - ratio=0.8, - sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - ) - compressed_model.to("cpu") # issue with cuda export - export_from_model(compressed_model, OUTPUT_DIR, stateful=False, compression_option="fp32", device="cpu") - - -def validate(tokenizer, model): - input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device) - - start_t = time.time() - output = model.generate(**input_ids, max_new_tokens=100) - print("Elapsed time: ", time.time() - start_t) - - output_text = tokenizer.decode(output[0]) - print(output_text) - return output_text - - -def main(): - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - model = AutoModelForCausalLM.from_pretrained(MODEL_ID, load_in_8bit=False).to(device) - model.eval() - - dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - # dataset = dataset.filter(lambda example: len(example["text"]) > 128) # THIS LEADS TO A WORSE RESULT ON VALIDATION - - def transform_fn(data): - tokenized_text = tokenizer(data["text"], return_tensors="pt").to(device) - return tokenized_text.data # NEED TO RETURN ONE OF THE FORMATS of ENGINE (DICT) - - dataset = dataset.map(transform_fn).with_format("torch", device=device) - - quantize(model, dataset) - model = OVModelForCausalLM.from_pretrained( - OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"} - ) - validate(tokenizer, model) - - -if __name__ == "__main__": - main() diff --git a/examples/llm_compression/torch/llama_3_2/requirements.txt b/examples/llm_compression/torch/llama_3_2/requirements.txt deleted file mode 100644 index e29c588e595..00000000000 --- a/examples/llm_compression/torch/llama_3_2/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -transformers -datasets==3.2 -openvino==2024.6 -optimum-intel[openvino] From 6e7d9819537643b2ec621f5329e352807d4b3684 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Wed, 15 Jan 2025 14:19:19 +0100 Subject: [PATCH 22/46] rm comments --- .../algorithms/weight_compression/scale_estimation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index b120092f6ca..e0cd43fa29c 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -237,7 +237,7 @@ def calculate_quantization_params( # normalize importances for every group of weights to make sum of them equal to 1.0 denum = fns.sum(importance, axis=2, keepdims=True) - importance = importance / (denum + eps) # for each weight in a group + importance = importance / (denum + eps) X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) @@ -278,7 +278,7 @@ def 
calculate_quantization_params(
         if zp is not None:
             input_tensors[2] = zp.data
         # iterative rectification of initial scale
-        for i in range(initial_steps):  # make several iterations of updating the scale
+        for i in range(initial_steps):
             near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
             near_to_ideal_scale = near_to_ideal_scale * scale_sign
             input_tensors[1] = near_to_ideal_scale.data
@@ -406,7 +406,6 @@ def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None
 
 
 def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor:
-    # ideal scale to determine the importance of the weights
     """
     Estimates scales for the given weight, target, zero mask, and importance.
 

From 683cfd490deab2226ad213a6dba536a6d80bcf9f Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Thu, 16 Jan 2025 10:01:57 +0100
Subject: [PATCH 23/46] fix tests

---
 .../weight_compression/scale_estimation.py | 4 ++--
 .../algorithms/weight_compression/torch_backend.py | 12 ++++++++--
 .../weight_compression/torch_fx_backend.py | 11 +++++++++
 tests/torch/fx/test_compress_weights.py | 9 ++++++++-
 4 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index e0cd43fa29c..6fae3ed905f 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -274,9 +274,9 @@ def calculate_quantization_params(
         zero_scale = 0.001
         zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
 
-        input_tensors = [original_weight.data, None, None]
+        input_tensors = [original_weight.data, None]
         if zp is not None:
-            input_tensors[2] = zp.data
+            input_tensors.append(zp.data)
         # iterative rectification of initial scale
         for i in range(initial_steps):
diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py
index 19fb1c2586c..649eb779775 100644
--- a/nncf/quantization/algorithms/weight_compression/torch_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py
@@ -94,7 +94,11 @@ def get_compress_fn(mode, num_bits: int):
     level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1
 
     def _forward_fn(inputs):
-        tensor, scale, zero_point = inputs
+        if len(inputs) == 3:
+            tensor, scale, zero_point = inputs
+        else:
+            tensor, scale = inputs
+            zero_point = None
         with torch.no_grad():
             return compress(tensor, scale, zero_point, level_low, level_high)
 
@@ -107,7 +111,11 @@ def get_compress_decompress_fn(mode, num_bits: int):
     level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1
 
     def _forward_fn(inputs):
-        tensor, scale, zero_point = inputs
+        if len(inputs) == 3:
+            tensor, scale, zero_point = inputs
+        else:
+            tensor, scale = inputs
+            zero_point = None
         with torch.no_grad():
             return compress_decompress(
                 tensor=tensor, scale=scale, zero_point=zero_point, level_low=level_low, level_high=level_high
diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py
index 449957d3ecb..8b14f441669 100644
--- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py
@@
-174,6 +174,17 @@ def insert_adapters( ) -> None: pass + @staticmethod + def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: + def filter_func(point: StatisticPoint) -> bool: + return ( + algorithm_key in point.algorithm_to_tensor_collectors + and point.target_point.type + == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] + ) + + return filter_func + def transform_model( self, model: torch.fx.GraphModule, diff --git a/tests/torch/fx/test_compress_weights.py b/tests/torch/fx/test_compress_weights.py index 36b3e575db2..9d5bb0d880d 100644 --- a/tests/torch/fx/test_compress_weights.py +++ b/tests/torch/fx/test_compress_weights.py @@ -17,6 +17,7 @@ import nncf from nncf import BackupMode from nncf import CompressWeightsMode +from nncf import SensitivityMetric from nncf.common.factory import NNCFGraphFactory from nncf.data.dataset import Dataset from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node @@ -25,7 +26,6 @@ from nncf.torch.dynamic_graph.patch_pytorch import disable_patching from tests.torch.fx.helpers import get_torch_fx_model from tests.torch.ptq.test_weights_compression import ALL_SENSITIVITY_METRICS -from tests.torch.ptq.test_weights_compression import DATA_BASED_SENSITIVITY_METRICS from tests.torch.ptq.test_weights_compression import INT4_MODES from tests.torch.ptq.test_weights_compression import INT8_MODES from tests.torch.ptq.test_weights_compression import SUPPORTED_MODES @@ -37,6 +37,13 @@ from tests.torch.ptq.test_weights_compression import MatMulModel from tests.torch.test_models.synthetic import ShortTransformer +DATA_BASED_SENSITIVITY_METRICS = ( + SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + SensitivityMetric.MEAN_ACTIVATION_VARIANCE, + SensitivityMetric.MAX_ACTIVATION_VARIANCE, + SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, +) + def get_model_size(model): param_size = 0 From 1a33369df2b7e6d5dc1ee79fecbb027dc51464c2 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Thu, 16 Jan 2025 11:30:05 +0100 Subject: [PATCH 24/46] reimplement compress/decompress --- .../weight_compression/torch_backend.py | 62 +++++-------------- 1 file changed, 15 insertions(+), 47 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 649eb779775..a20d4934f0c 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -40,7 +40,9 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType from nncf.torch.dynamic_graph.scope import Scope @@ -59,67 +61,33 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -def compress( - tensor: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None, level_low: int, level_high: int -) -> 
torch.Tensor: - x = tensor / scale - if zero_point is not None: - x = x + zero_point - x = torch.round(x) - x = torch.clamp(x, min=level_low, max=level_high) - return x - - -def decompress(compressed: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None) -> torch.Tensor: - if zero_point is not None: - return (compressed - zero_point) * scale - return compressed * scale - - -def compress_decompress( - tensor: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor | None, level_low: int, level_high: int -) -> torch.Tensor: - x = compress(tensor, scale, zero_point, level_low, level_high) - # Step 2: decompress - if zero_point is not None: - x = (x - zero_point) * scale - else: - x = x * scale - return x - - -def get_compress_fn(mode, num_bits: int): - asym_quant = mode in [CompressWeightsMode.INT4_ASYM] - level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) - level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - +def get_compress_fn(config): def _forward_fn(inputs): if len(inputs) == 3: tensor, scale, zero_point = inputs + tensor, scale, zero_point = Tensor(tensor), Tensor(scale), Tensor(zero_point) else: tensor, scale = inputs + tensor, scale = Tensor(tensor), Tensor(scale) zero_point = None - with torch.no_grad(): - return compress(tensor, scale, zero_point, level_low, level_high) + quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) + return quantized.data return _forward_fn -def get_compress_decompress_fn(mode, num_bits: int): - asym_quant = mode in [CompressWeightsMode.INT4_ASYM] - level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) - level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - +def get_compress_decompress_fn(config): def _forward_fn(inputs): if len(inputs) == 3: tensor, scale, zero_point = inputs + tensor, scale, zero_point = Tensor(tensor), Tensor(scale), Tensor(zero_point) else: tensor, scale = inputs + tensor, scale = Tensor(tensor), Tensor(scale) zero_point = None - with torch.no_grad(): - return compress_decompress( - tensor=tensor, scale=scale, zero_point=zero_point, level_low=level_low, level_high=level_high - ) + quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) + dequantized = do_int_dequantization(quantized, scale=scale, zero_point=zero_point) + return dequantized.data return _forward_fn @@ -278,11 +246,11 @@ def insert_adapters( @staticmethod def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - return get_compress_decompress_fn(config.mode, config.num_bits) + return get_compress_decompress_fn(config) @staticmethod def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): - return get_compress_fn(config.mode, config.num_bits) + return get_compress_fn(config) @staticmethod def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: From dcf88a5e646fb0204b3130581ef8dc3911f25b7e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Thu, 16 Jan 2025 11:50:11 +0100 Subject: [PATCH 25/46] rm fx --- .../weight_compression/mixed_precision.py | 6 +-- .../weight_compression/scale_estimation.py | 6 +-- .../weight_compression/torch_fx_backend.py | 51 ------------------- 3 files changed, 2 insertions(+), 61 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/mixed_precision.py 
b/nncf/quantization/algorithms/weight_compression/mixed_precision.py index 6aa8a6b3c5e..2bab5e78f35 100644 --- a/nncf/quantization/algorithms/weight_compression/mixed_precision.py +++ b/nncf/quantization/algorithms/weight_compression/mixed_precision.py @@ -211,7 +211,7 @@ class DataBasedCriterion(DataFreeCriterion, ABC): @property def available_backends(self) -> List[BackendType]: - return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX] + return [BackendType.OPENVINO, BackendType.TORCH] def _set_backend_entity(self, model: TModel) -> None: model_backend = get_backend(model) @@ -223,10 +223,6 @@ def _set_backend_entity(self, model: TModel) -> None: from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend self._backend_entity = PTMixedPrecisionAlgoBackend() - elif model_backend == BackendType.TORCH_FX: - from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXMixedPrecisionAlgoBackend - - self._backend_entity = FXMixedPrecisionAlgoBackend() else: raise nncf.UnsupportedBackendError( "Cannot return backend-specific entity because {} is not supported!".format(model_backend.value) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 6fae3ed905f..40790f595b9 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -67,7 +67,7 @@ def __init__( @property def available_backends(self) -> List[BackendType]: - return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX] + return [BackendType.OPENVINO, BackendType.TORCH] def _set_backend_entity(self, model: TModel) -> None: """ @@ -84,10 +84,6 @@ def _set_backend_entity(self, model: TModel) -> None: from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend self._backend_entity = PTWeightCompressionAlgoBackend() - elif model_backend == BackendType.TORCH_FX: - from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend - - self._backend_entity = FXWeightCompressionAlgoBackend() else: raise nncf.UnsupportedBackendError( "Cannot return backend-specific Scale Estimation entity because {} is not supported!".format( diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 8b14f441669..8b57cf5f5c4 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -23,17 +23,10 @@ from nncf.common.graph.transformations.commands import TargetType from nncf.common.graph.transformations.layout import TransformationLayout from nncf.common.tensor_statistics.statistic_point import StatisticPoint -from nncf.experimental.common.tensor_statistics.collectors import MaxVarianceReducer -from nncf.experimental.common.tensor_statistics.collectors import MeanAbsMaxReducer -from nncf.experimental.common.tensor_statistics.collectors import MeanAggregator from nncf.experimental.common.tensor_statistics.collectors import MeanReducer -from nncf.experimental.common.tensor_statistics.collectors import MeanVarianceReducer from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator from nncf.experimental.common.tensor_statistics.collectors import ShapeReducer from 
nncf.experimental.common.tensor_statistics.collectors import TensorCollector -from nncf.experimental.common.tensor_statistics.statistics import MaxVarianceTensorStatistic -from nncf.experimental.common.tensor_statistics.statistics import MeanMagnitudeTensorStatistic -from nncf.experimental.common.tensor_statistics.statistics import MeanVarianceTensorStatistic from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.experimental.torch.fx.commands import FXApplyTransformationCommand from nncf.experimental.torch.fx.model_transformer import FXModelTransformer @@ -42,7 +35,6 @@ from nncf.experimental.torch.fx.transformations import constant_update_transformation_builder from nncf.experimental.torch.fx.transformations import module_insertion_transformation_builder from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -271,46 +263,3 @@ def transform_model( transformed_model = FXModelTransformer(model).transform(transformation_layout) return transformed_model - - -class FXMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, FXWeightCompressionAlgoBackend): - @staticmethod - def mean_variance_statistic_collector( - reduction_axes: Tuple[int], subset_size: Optional[int] = None - ) -> TensorCollector: - reducer = MeanVarianceReducer(reduction_axes) - aggregator = MeanAggregator(num_samples=subset_size) - collector = TensorCollector(MeanVarianceTensorStatistic) - collector.register_statistic_branch(MeanVarianceTensorStatistic.MEAN_VARIANCE_STAT, reducer, aggregator) - return collector - - @staticmethod - def max_variance_statistic_collector( - reduction_axes: Tuple[int], subset_size: Optional[int] = None - ) -> TensorCollector: - reducer = MaxVarianceReducer(reduction_axes) - aggregator = MeanAggregator(num_samples=subset_size) - collector = TensorCollector(MaxVarianceTensorStatistic) - collector.register_statistic_branch(MaxVarianceTensorStatistic.MAX_VARIANCE_STAT, reducer, aggregator) - return collector - - @staticmethod - def mean_abs_max_statistic_collector( - reduction_axes: Tuple[int], subset_size: Optional[int] = None - ) -> TensorCollector: - reducer = MeanAbsMaxReducer(reduction_axes) - aggregator = MeanAggregator(num_samples=subset_size) - collector = TensorCollector(MeanMagnitudeTensorStatistic) - collector.register_statistic_branch(MeanMagnitudeTensorStatistic.MEAN_MAGNITUDE_STAT, reducer, aggregator) - return collector - - @staticmethod - def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: - def filter_func(point: StatisticPoint) -> bool: - return ( - algorithm_key in point.algorithm_to_tensor_collectors - and point.target_point.type - == PTWeightCompressionAlgoBackend.TARGET_TYPE_TO_PT_INS_TYPE_MAP[TargetType.POST_LAYER_OPERATION] - ) - - return filter_func From e48a44b8296105dbb4ab47f5a843f94108b62bef Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 17 Jan 2025 16:22:29 +0100 Subject: [PATCH 26/46] add wc template --- nncf/torch/quantization/quantize_functions.py | 2 + .../template_test_weights_compression.py | 122 ++++++++++++++++++ tests/openvino/native/models.py | 4 +- 
.../quantization/test_weights_compression.py | 112 +++++----------- tests/torch/ptq/test_weights_compression.py | 61 +++++++++ 5 files changed, 218 insertions(+), 83 deletions(-) create mode 100644 tests/cross_fw/test_templates/template_test_weights_compression.py diff --git a/nncf/torch/quantization/quantize_functions.py b/nncf/torch/quantization/quantize_functions.py index 967a02dc256..debb4b5653b 100644 --- a/nncf/torch/quantization/quantize_functions.py +++ b/nncf/torch/quantization/quantize_functions.py @@ -292,6 +292,8 @@ def pack_uint4(tensor: torch.Tensor) -> torch.Tensor: if tensor.dtype != torch.uint8: raise ValidationError(f"Invalid tensor dtype {tensor.type}. torch.uint8 type is supported.") packed_tensor = tensor.contiguous() + # packed_tensor = packed_tensor.split(2, dim=-1) + # packed_tensor = packed_tensor packed_tensor = packed_tensor.reshape(-1, 2) packed_tensor = torch.bitwise_and(packed_tensor[..., ::2], 15) | packed_tensor[..., 1::2] << 4 return packed_tensor diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py new file mode 100644 index 00000000000..4d38e60090e --- /dev/null +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -0,0 +1,122 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
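+
+# A quick cross-check of the reference statistics defined below (illustrative
+# only, not used by the tests; plain NumPy over the shared ACTIVATION sample):
+#
+#   import numpy as np
+#   activation = np.array([[[-4, 1, 2], [0, 0, 0], [0, 0, 0]]], dtype=np.float32)
+#   np.max(np.var(activation, 1))           # -> 3.555555 (MAX_VAR)
+#   np.mean(np.var(activation, 1))          # -> 1.555555 (MEAN_VAR)
+#   np.mean(np.max(np.abs(activation), 1))  # -> 2.333333 (MEAN_MAX)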
+import math +from abc import ABC +from abc import abstractmethod +from typing import TypeVar + +import numpy as np +import pytest + +from nncf import CompressWeightsMode +from nncf import SensitivityMetric +from nncf.data.dataset import Dataset +from nncf.quantization import compress_weights +from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA +from nncf.tensor import TensorDataType + +TTensor = TypeVar("TTensor") + +NON_ZERO_ROW = [-4, 1, 2] +ACTIVATION = [[NON_ZERO_ROW, [0, 0, 0], [0, 0, 0]]] +MAX_VAR = 3.555555 # np.max(np.var(ACTIVATION, 1)) +MEAN_VAR = 1.555555 # np.mean(np.var(ACTIVATION, 1)) +MEAN_MAX = 2.333333 # np.mean(np.max(np.abs(ACTIVATION), 1)) +HESSIAN_TRACE = (16 + 1 + 4) * 2 / 9 # sum(i*i for i in NON_ZERO_ROW) * 2 / ACTIVATION.size +MAX_BASELINE_SCORE = 1 / 1.1920928955078125e-07 + + +class TemplateWeightCompression(ABC): + @staticmethod + @abstractmethod + def cast_to(x: TTensor, dtype: TensorDataType) -> TTensor: + pass + + @abstractmethod + def get_matmul_model(self): + """Returns a model instance.""" + + @pytest.mark.parametrize( + ("mode", "ref_act_score", "ref_score"), + ( + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, HESSIAN_TRACE, 0), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, MEAN_MAX, MEAN_MAX * MAX_BASELINE_SCORE), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, MEAN_VAR, MEAN_VAR * MAX_BASELINE_SCORE), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, MAX_VAR, MAX_VAR * MAX_BASELINE_SCORE), + ), + ) + def test_data_based_criterion(self, mode, ref_score, ref_act_score, mocker): + model = self.get_matmul_model() + data = self.cast_to(self.to_tensor(ACTIVATION), dtype=TensorDataType.float32) + dataset = Dataset([data]) + criterion_cls = MIXED_PRECISION_CRITERIA.get(mode) + scores_spy = mocker.spy(criterion_cls, "_calc_sensitivity") + act_scores_spy = mocker.spy(criterion_cls, "_calc_activation_sensitivity") + + compress_weights( + model, + mode=CompressWeightsMode.INT4_ASYM, + ratio=0.5, + group_size=1, + dataset=dataset, + sensitivity_metric=mode, + all_layers=True, + ) + scores = scores_spy.spy_return + act_scores = act_scores_spy.spy_return + assert math.isclose(scores[0], ref_score, rel_tol=1e-05, abs_tol=1e-08) + assert math.isclose(ref_act_score, act_scores, rel_tol=1e-05, abs_tol=1e-08) + + @abstractmethod + def get_sequential_matmul_model(self): ... + + @abstractmethod + def to_tensor(): ... + + @abstractmethod + def check_weights(self, model, ref_ids): ... 
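+
+    # `ref_ids` in the parametrization below lists the indices of the matmul
+    # layers expected to end up in INT4: layers are ranked by the chosen
+    # sensitivity metric and only the share of weights allowed by `ratio`
+    # stays in 4 bit; the rest falls back to the 8-bit backup precision
+    # (a qualitative summary, not an exact formula).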
+ + @pytest.mark.parametrize( + ("mode", "all_layers", "ratio", "ref_ids"), + ( + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2]), + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, True, 0.8, [0, 1, 2]), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2]), + ), + ) + def test_mixed_precision(self, mode, all_layers, ratio, ref_ids): + model = self.get_sequential_matmul_model() + first = self.to_tensor(np.ones([1, 4, 4], dtype=np.float32)) + second = self.to_tensor(np.arange(16, dtype=np.float32)).reshape(1, 4, 4) + dataset = Dataset([first, second]) + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.INT4_SYM, + ratio=ratio, + group_size=1, + all_layers=all_layers, + sensitivity_metric=mode, + dataset=dataset, + ) + self.check_weights(compressed_model, ref_ids) diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index df83d366a9e..bbbde714d39 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -798,12 +798,12 @@ class SequentialMatmulModel(OVReferenceModel): """ def _create_ov_model(self): - input_node = opset.parameter([1, 3, 3], name="Input_1") + input_node = opset.parameter([1, 4, 4], name="Input_1") main_values = [10000, 1000, 1, 10, 10000] last_node = input_node for i, main_value in enumerate(main_values): - weights_data = np.arange(0, 9).reshape(3, 3) + weights_data = np.arange(0, 16).reshape(4, 4) weights_data[-1, -1] = main_value current_weights = opset.constant(weights_data, dtype=np.float32, name=f"weights_{i}") current_node = opset.matmul( diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index e39a621a4a8..a2b4fad8d03 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -47,6 +47,8 @@ from tests.cross_fw.shared.comparator import compare_stats from tests.cross_fw.shared.json import dump_to_json from tests.cross_fw.shared.json import load_json +from tests.cross_fw.test_templates.template_test_weights_compression import ACTIVATION +from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.openvino.native.common import get_actual_reference_for_current_openvino from tests.openvino.native.models import AWQActMatmulModel from tests.openvino.native.models import AWQMatmulModel @@ -263,46 +265,6 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): compare_stats(ref_stats, actual_stats) -@pytest.mark.parametrize( - 
("mode", "all_layers", "ratio", "ref_ids"), - ( - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), - (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2]), - (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MAX_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), - (SensitivityMetric.MAX_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, True, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2]), - ), -) -def test_mixed_precision(mode, all_layers, ratio, ref_ids, mocker): - model = SequentialMatmulModel().ov_model - dataset = Dataset([np.ones([1, 3, 3]), np.arange(9).reshape(1, 3, 3)]) - compressed_model = compress_weights( - model, - mode=CompressWeightsMode.NF4, - ratio=ratio, - group_size=1, - all_layers=all_layers, - sensitivity_metric=mode, - dataset=dataset, - ) - names = { - op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == ov.Type.nf4 - } - ref_nf4_nodes = {f"weights_{i}" for i in ref_ids} - assert ref_nf4_nodes == names - - @pytest.mark.parametrize("metric", DATA_BASED_SENSITIVITY_METRICS) def test_gather_in_4_bit_if_all_layers_with_data(metric): dim1 = 2 # sequence length dimension @@ -430,46 +392,6 @@ def test_gather_in_8_bit_if_not_all_layers(metric): assert node.get_element_type() == ov.Type.u8 -MAX_BASELINE_SCORE = 1 / np.finfo(np.float32).eps -NON_ZERO_ROW = [-4, 1, 2] -ACTIVATION = np.array([[NON_ZERO_ROW, [0, 0, 0], [0, 0, 0]]]) -MAX_VAR = 3.555555 # np.max(np.var(ACTIVATION, 1)) -MEAN_VAR = 1.555555 # np.mean(np.var(ACTIVATION, 1)) -MEAN_MAX = 2.333333 # np.mean(np.max(np.abs(ACTIVATION), 1)) -HESSIAN_TRACE = (16 + 1 + 4) * 2 / 9 # sum(i*i for i in NON_ZERO_ROW) * 2 / ACTIVATION.size - - -@pytest.mark.parametrize( - ("mode", "ref_act_scores", "ref_scores"), - ( - (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, HESSIAN_TRACE, 0), - (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, MEAN_MAX, MEAN_MAX * MAX_BASELINE_SCORE), - (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, MEAN_VAR, MEAN_VAR * MAX_BASELINE_SCORE), - (SensitivityMetric.MAX_ACTIVATION_VARIANCE, MAX_VAR, MAX_VAR * MAX_BASELINE_SCORE), - ), -) -def test_data_based_criterion(mode, ref_scores, ref_act_scores, mocker): - model = IdentityMatmul().ov_model - dataset = Dataset([ACTIVATION]) - criterion_cls = MIXED_PRECISION_CRITERIA.get(mode) - scores_spy = mocker.spy(criterion_cls, "_calc_sensitivity") - act_scores_spy = mocker.spy(criterion_cls, "_calc_activation_sensitivity") - - compress_weights( - model, - mode=CompressWeightsMode.NF4, - ratio=0.5, - group_size=1, - dataset=dataset, - sensitivity_metric=mode, - all_layers=True, - ) - scores = scores_spy.spy_return - act_scores = act_scores_spy.spy_return - assert np.allclose(scores, ref_scores) - assert np.allclose(act_scores, ref_act_scores) - - @pytest.mark.parametrize("mode", 
(CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM)) def test_quantize_Gather_with_multiple_reduction_axes_in_8bit(mode): model = GatherWithTwoReductionAxes().ov_model @@ -1055,7 +977,7 @@ def test_call_gptq(mode): ) def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): model = SequentialMatmulModel().ov_model - dataset = Dataset([np.ones([1, 3, 3]), np.arange(9).reshape(3, 3)]) + dataset = Dataset([np.ones([1, 4, 4]), np.arange(16).reshape(4, 4)]) compressed_model = compress_weights( model, mode=CompressWeightsMode.E2M1, @@ -1596,3 +1518,31 @@ def test_compression_with_transposed_activations(kwargs): all_layers=True, **kwargs, ) + + +class TestOVTemplateWeightCompression(TemplateWeightCompression): + @staticmethod + def get_matmul_model(): + return IdentityMatmul().ov_model + + @staticmethod + def get_sequential_matmul_model(): + return SequentialMatmulModel().ov_model + + @staticmethod + def to_tensor(x): + return np.array(x) + + @staticmethod + def cast_to(x: np.ndarray, dtype: TensorDataType) -> np.ndarray: + if dtype is TensorDataType.float32: + return x.astype(np.float32) + if dtype is TensorDataType.float16: + return x.astype(np.float16) + raise NotImplementedError + + @staticmethod + def check_weights(model, ref_ids): + names = {op.get_friendly_name() for op in model.get_ordered_ops() if op.get_element_type() == ov.Type.i4} + ref_nf4_nodes = {f"weights_{i}" for i in ref_ids} + assert ref_nf4_nodes == names diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 979326cfb36..d9c0dc2b2f2 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -11,6 +11,7 @@ import pytest import torch +import torch.nn as nn import torch.nn.functional as F import nncf @@ -19,6 +20,7 @@ from nncf import SensitivityMetric from nncf.quantization import compress_weights from nncf.quantization.advanced_parameters import AdvancedCompressionParameters +from nncf.tensor import TensorDataType from nncf.torch import wrap_model from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor from nncf.torch.quantization.layers import INT4SymmetricWeightsDecompressor @@ -28,7 +30,9 @@ from nncf.torch.quantization.quantize_functions import pack_uint4 from nncf.torch.quantization.quantize_functions import unpack_int4 from nncf.torch.quantization.quantize_functions import unpack_uint4 +from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.torch.test_models.synthetic import ShortTransformer +from tests.torch.test_tensor import cast_to ALL_SENSITIVITY_METRICS = list(SensitivityMetric) @@ -318,3 +322,60 @@ def test_pack_int4(): assert packed_w.numel() * 2 == w_int8.numel() unpacked_w = unpack_int4(packed_w).reshape(w_int8.shape) assert torch.all(unpacked_w == w_int8) + + +class IdentityMatmul(torch.nn.Module): + def __init__(self): + super().__init__() + self.w = torch.nn.Parameter( + torch.eye(3, dtype=torch.float32) * 255, + ) + + def forward(self, input): + return input @ self.w + + +class SequentialMatmulModel(nn.Module): + def __init__(self): + super(SequentialMatmulModel, self).__init__() + self.main_values = [10000, 1000, 1, 10, 10000] + self.layers = nn.ModuleList() + + for _, main_value in enumerate(self.main_values): + weights_data = torch.arange(0, 16, dtype=torch.float32).reshape(4, 4) + weights_data[-1, -1] = main_value + weight_tensor = torch.tensor(weights_data) + layer = nn.Linear(4, 4, bias=False) + 
layer.weight = nn.Parameter(weight_tensor.t()) + self.layers.append(layer) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class TestPTTemplateWeightCompression(TemplateWeightCompression): + @staticmethod + def get_matmul_model(): + return IdentityMatmul() + + @staticmethod + def get_sequential_matmul_model(): + return SequentialMatmulModel() + + @staticmethod + def to_tensor(t): + return torch.tensor(t) + + @staticmethod + def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: + return cast_to(x, dtype) + + @staticmethod + def check_weights(model, ref_ids): + for i, op in enumerate(model.layers): + if i in ref_ids: + assert torch.numel(op.weight) == 8 # workaround to detect uint4 weights + else: + assert torch.numel(op.weight) == 16 From 63e8c0ad07ff128363f2b3d29fec37831a99ab4a Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 17 Jan 2025 16:31:01 +0100 Subject: [PATCH 27/46] polishing --- .../template_test_weights_compression.py | 20 +++++++++++++------ .../quantization/test_weights_compression.py | 12 +++++------ tests/torch/ptq/test_weights_compression.py | 10 ++++++---- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 4d38e60090e..a71d0f7d24a 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -11,7 +11,7 @@ import math from abc import ABC from abc import abstractmethod -from typing import TypeVar +from typing import List, TypeVar import numpy as np import pytest @@ -23,6 +23,7 @@ from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.tensor import TensorDataType +TModel = TypeVar("TModel") TTensor = TypeVar("TTensor") NON_ZERO_ROW = [-4, 1, 2] @@ -41,8 +42,10 @@ def cast_to(x: TTensor, dtype: TensorDataType) -> TTensor: pass @abstractmethod - def get_matmul_model(self): - """Returns a model instance.""" + def get_matmul_model() -> TModel: + """ + Returns a backend model for test_data_based_criterion. + """ @pytest.mark.parametrize( ("mode", "ref_act_score", "ref_score"), @@ -76,13 +79,18 @@ def test_data_based_criterion(self, mode, ref_score, ref_act_score, mocker): assert math.isclose(ref_act_score, act_scores, rel_tol=1e-05, abs_tol=1e-08) @abstractmethod - def get_sequential_matmul_model(self): ... + def get_sequential_matmul_model() -> TModel: + """ + Returns a backend model for test_mixed_precision. + """ @abstractmethod - def to_tensor(): ... + def to_tensor(x: TTensor) -> TTensor: + pass @abstractmethod - def check_weights(self, model, ref_ids): ... 
+ def check_weights(model: TModel, ref_ids: List[int]) -> None: + """Checks that only weights with specified ids are compressed in int4 format.""" @pytest.mark.parametrize( ("mode", "all_layers", "ratio", "ref_ids"), diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index a2b4fad8d03..aedfc1d0573 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -874,7 +874,7 @@ def test_number_of_reduced_statistics_for_subset_size( mocker, dataset_size, subset_size, ref_size, compression_args, multiplier_of_calls ): model = IdentityMatmul().ov_model - dataset = Dataset([ACTIVATION] * dataset_size) + dataset = Dataset([np.array(ACTIVATION)] * dataset_size) stats_spy = mocker.spy(AggregatorBase, "register_reduced_input") compress_weights(model, dataset=dataset, subset_size=subset_size, **compression_args) @@ -890,7 +890,7 @@ def test_default_subset_value(): @pytest.mark.parametrize("subset_size", (-1, 0)) def test_invalid_subset_size(subset_size): model = IdentityMatmul().ov_model - dataset = Dataset([ACTIVATION]) + dataset = Dataset([np.array(ACTIVATION)]) with pytest.raises(nncf.ValidationError): compress_weights(model, mode=CompressWeightsMode.INT4_ASYM, ratio=0.5, dataset=dataset, subset_size=subset_size) @@ -1522,15 +1522,15 @@ def test_compression_with_transposed_activations(kwargs): class TestOVTemplateWeightCompression(TemplateWeightCompression): @staticmethod - def get_matmul_model(): + def get_matmul_model() -> ov.Model: return IdentityMatmul().ov_model @staticmethod - def get_sequential_matmul_model(): + def get_sequential_matmul_model() -> ov.Model: return SequentialMatmulModel().ov_model @staticmethod - def to_tensor(x): + def to_tensor(x) -> np.ndarray: return np.array(x) @staticmethod @@ -1542,7 +1542,7 @@ def cast_to(x: np.ndarray, dtype: TensorDataType) -> np.ndarray: raise NotImplementedError @staticmethod - def check_weights(model, ref_ids): + def check_weights(model: ov.Model, ref_ids: List[int]) -> None: names = {op.get_friendly_name() for op in model.get_ordered_ops() if op.get_element_type() == ov.Type.i4} ref_nf4_nodes = {f"weights_{i}" for i in ref_ids} assert ref_nf4_nodes == names diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index d9c0dc2b2f2..06047849ba1 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -9,6 +9,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
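+# Packing note for the 4-bit checks in this module: pack_uint4/pack_int4 store
+# two 4-bit values per uint8, low nibble first, i.e. roughly:
+#
+#   a, b = 1, 2                    # two uint4 values
+#   packed = (a & 15) | (b << 4)   # -> 33 (0x21): one byte holds both values
+#
+# which is why a compressed 4x4 weight reports numel == 8 instead of 16.
+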
+from typing import List + import pytest import torch import torch.nn as nn @@ -357,15 +359,15 @@ def forward(self, x): class TestPTTemplateWeightCompression(TemplateWeightCompression): @staticmethod - def get_matmul_model(): + def get_matmul_model() -> torch.nn.Module: return IdentityMatmul() @staticmethod - def get_sequential_matmul_model(): + def get_sequential_matmul_model() -> torch.nn.Module: return SequentialMatmulModel() @staticmethod - def to_tensor(t): + def to_tensor(t) -> torch.Tensor: return torch.tensor(t) @staticmethod @@ -373,7 +375,7 @@ def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: return cast_to(x, dtype) @staticmethod - def check_weights(model, ref_ids): + def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None: for i, op in enumerate(model.layers): if i in ref_ids: assert torch.numel(op.weight) == 8 # workaround to detect uint4 weights From b2fef75fa5e071ef7a931da51a1839880bd0a631 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 17 Jan 2025 16:39:29 +0100 Subject: [PATCH 28/46] comment --- .../test_reducers_and_aggregators.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index 7d60f0fc01a..fd1b959732d 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -55,6 +55,7 @@ NO_OUTLIERS_DEFAULT_3D_MEDIAN_VALUE = [[5.0, 4.0, 15.0], [8.0, 25.0, 12.0], [35.0, 16.0, 45.0]] +WEIGHT_COMPRESSION_REDUCERS_DATA = [[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]] default_test_quantile = 0.1 @@ -238,10 +239,10 @@ def test_quantile_reducers(self, reducer_name, ref, reducers): @pytest.mark.parametrize( "axes, np_data, reference", [ - [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], - [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 14.25], - [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 15.875], - [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], + [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 14.25], + [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 15.875], + [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], ], ) def test_mean_variance_reducer(self, axes, np_data, reference): @@ -254,10 +255,10 @@ def test_mean_variance_reducer(self, axes, np_data, reference): @pytest.mark.parametrize( "axes, np_data, reference", [ - [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 10.0], - [(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 4.16666], - [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 6.33333], - [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 10.0], + [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 10.0], + [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 4.16666], + [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 6.33333], + [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 10.0], ], ) def test_mean_abs_max_reducer(self, axes, np_data, reference): @@ -270,10 +271,10 @@ def test_mean_abs_max_reducer(self, axes, np_data, reference): @pytest.mark.parametrize( "axes, np_data, reference", [ - [None, np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], - 
[(0,), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 64.0], - [(0, 1), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 36.1875], - [(0, 1, 2), np.array([[[1, 2, 0], [1, -3, 10]], [[-1, 2, -3], [4, 5, -6]]]), 16.1666], + [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], + [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 64.0], + [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 36.1875], + [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], ], ) def test_max_variance_reducer(self, axes, np_data, reference): @@ -582,8 +583,7 @@ def test_reducers_name_hash_equal(self, reducer_name, reducers): params["channel_axis"] = [1, 2] else: raise nncf.ValidationError( - "test_min_max_mean_reducer_hash_equal configurated in a wrong way." - f" Wrong reducer_name: {reducer_name}" + f"test_min_max_mean_reducer_hash_equal configurated in a wrong way. Wrong reducer_name: {reducer_name}" ) def product_dict(**kwargs): From 32788a4f46b154104dbc0f07457ace7f70d6abd9 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 17 Jan 2025 17:18:29 +0100 Subject: [PATCH 29/46] comments --- nncf/torch/quantization/quantize_functions.py | 2 - .../test_reducers_and_aggregators.py | 39 ++++++------------- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/nncf/torch/quantization/quantize_functions.py b/nncf/torch/quantization/quantize_functions.py index debb4b5653b..967a02dc256 100644 --- a/nncf/torch/quantization/quantize_functions.py +++ b/nncf/torch/quantization/quantize_functions.py @@ -292,8 +292,6 @@ def pack_uint4(tensor: torch.Tensor) -> torch.Tensor: if tensor.dtype != torch.uint8: raise ValidationError(f"Invalid tensor dtype {tensor.type}. torch.uint8 type is supported.") packed_tensor = tensor.contiguous() - # packed_tensor = packed_tensor.split(2, dim=-1) - # packed_tensor = packed_tensor packed_tensor = packed_tensor.reshape(-1, 2) packed_tensor = torch.bitwise_and(packed_tensor[..., ::2], 15) | packed_tensor[..., 1::2] << 4 return packed_tensor diff --git a/tests/common/experimental/test_reducers_and_aggregators.py b/tests/common/experimental/test_reducers_and_aggregators.py index fd1b959732d..334640f996e 100644 --- a/tests/common/experimental/test_reducers_and_aggregators.py +++ b/tests/common/experimental/test_reducers_and_aggregators.py @@ -237,49 +237,34 @@ def test_quantile_reducers(self, reducer_name, ref, reducers): assert fns.allclose(val[i], self.get_nncf_tensor(ref_)) @pytest.mark.parametrize( - "axes, np_data, reference", - [ - [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], - [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 14.25], - [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 15.875], - [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], - ], + "axes, reference", + [[None, 16.1666], [(0,), 14.25], [(0, 1), 15.875], [(0, 1, 2), 16.1666]], ) - def test_mean_variance_reducer(self, axes, np_data, reference): + def test_mean_variance_reducer(self, axes, reference): reducer = MeanVarianceReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) + nncf_data = self.get_nncf_tensor(np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) @pytest.mark.parametrize( - "axes, np_data, reference", - [ - [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 10.0], - [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 
4.16666], - [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 6.33333], - [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 10.0], - ], + "axes, reference", + [[None, 10.0], [(0,), 4.16666], [(0, 1), 6.33333], [(0, 1, 2), 10.0]], ) - def test_mean_abs_max_reducer(self, axes, np_data, reference): + def test_mean_abs_max_reducer(self, axes, reference): reducer = MeanAbsMaxReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) + nncf_data = self.get_nncf_tensor(np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) @pytest.mark.parametrize( - "axes, np_data, reference", - [ - [None, np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], - [(0,), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 64.0], - [(0, 1), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 36.1875], - [(0, 1, 2), np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), 16.1666], - ], + "axes, reference", + [[None, 16.1666], [(0,), 64.0], [(0, 1), 36.1875], [(0, 1, 2), 16.1666]], ) - def test_max_variance_reducer(self, axes, np_data, reference): + def test_max_variance_reducer(self, axes, reference): reducer = MaxVarianceReducer(reduction_axes=axes) - nncf_data = self.get_nncf_tensor(np_data, dtype=Dtype.FLOAT) + nncf_data = self.get_nncf_tensor(np.array(WEIGHT_COMPRESSION_REDUCERS_DATA), dtype=Dtype.FLOAT) result = reducer._reduce_out_of_place([nncf_data]) assert len(result) == 1 assert fns.allclose(result[0], self.get_nncf_tensor(reference)) From 9d0acdbd2685c88d3f1caf2804e2d57e9762857e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Mon, 20 Jan 2025 11:26:42 +0100 Subject: [PATCH 30/46] rollback no_grad --- .../weight_compression/torch_backend.py | 33 +++++++++---------- nncf/torch/engine.py | 11 +++---- .../post_training/data/wc_reference_data.yaml | 2 +- 3 files changed, 21 insertions(+), 25 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index a20d4934f0c..518be41721d 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -61,30 +61,27 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -def get_compress_fn(config): - def _forward_fn(inputs): - if len(inputs) == 3: - tensor, scale, zero_point = inputs - tensor, scale, zero_point = Tensor(tensor), Tensor(scale), Tensor(zero_point) - else: - tensor, scale = inputs - tensor, scale = Tensor(tensor), Tensor(scale) - zero_point = None +def _prepare_inputs( + tensor: torch.Tensor, scale: torch.Tensor, zero_point=Optional[torch.Tensor] +) -> Tuple[Tensor, Tensor, Optional[Tensor]]: + tensor, scale = Tensor(tensor), Tensor(scale) + if zero_point is not None: + zero_point = Tensor(zero_point) + return tensor, scale, zero_point + + +def get_compress_fn(config: WeightCompressionConfig) -> Callable[[Tuple], Tensor]: + def _forward_fn(inputs: Tuple) -> Tensor: + tensor, scale, zero_point = _prepare_inputs(*inputs) quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) return quantized.data return _forward_fn -def get_compress_decompress_fn(config): - def _forward_fn(inputs): - if len(inputs) == 3: - tensor, scale, zero_point = inputs - tensor, scale, zero_point = Tensor(tensor), Tensor(scale), Tensor(zero_point) - else: - tensor, scale = 
inputs - tensor, scale = Tensor(tensor), Tensor(scale) - zero_point = None +def get_compress_decompress_fn(config: WeightCompressionConfig) -> Callable[[Tuple], Tensor]: + def _forward_fn(inputs: Tuple) -> Tensor: + tensor, scale, zero_point = _prepare_inputs(*inputs) quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) dequantized = do_int_dequantization(quantized, scale=scale, zero_point=zero_point) return dequantized.data diff --git a/nncf/torch/engine.py b/nncf/torch/engine.py index 27b7cc5e706..239c6857c24 100644 --- a/nncf/torch/engine.py +++ b/nncf/torch/engine.py @@ -44,10 +44,9 @@ def infer( :param input_data: Inputs for the model. :return: Model outputs. """ - with torch.no_grad(): - if isinstance(input_data, dict): - return self._model(**input_data) - if isinstance(input_data, tuple): - return self._model(*input_data) + if isinstance(input_data, dict): + return self._model(**input_data) + if isinstance(input_data, tuple): + return self._model(*input_data) - return self._model(input_data) + return self._model(input_data) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 8d47e097953..3d27d81ee20 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -36,7 +36,7 @@ tinyllama_scale_estimation_per_channel_backend_OV: num_int4: 188 num_int8: 124 tinyllama_scale_estimation_per_channel_backend_TORCH: - metric_value: 0.80799 + metric_value: 0.81389 num_int4: 188 num_int8: 124 tinyllama_data_aware_lora_stateful_backend_OV: From 37a41ac6feb4ffb768933214e1312e2ac624570e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Mon, 20 Jan 2025 12:59:29 +0100 Subject: [PATCH 31/46] add torch.no_grad() --- .../torch/fx/quantization/quantize_model.py | 40 +++++++++---------- nncf/torch/quantization/quantize_model.py | 36 ++++++++--------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/nncf/experimental/torch/fx/quantization/quantize_model.py b/nncf/experimental/torch/fx/quantization/quantize_model.py index 3d5c64dfccd..ef2aac355ca 100644 --- a/nncf/experimental/torch/fx/quantization/quantize_model.py +++ b/nncf/experimental/torch/fx/quantization/quantize_model.py @@ -135,25 +135,25 @@ def compress_weights_impl( """ Implementation of the `compress_weights()` method for the Torch Fx backend. 
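    The whole flow runs in a no-grad context, since weight compression only
    reads the model weights and needs no autograd state.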
""" - - compression_algorithm = WeightCompression( - mode, - ratio, - group_size, - ignored_scope, - all_layers, - sensitivity_metric, - awq, - subset_size, - scale_estimation, - gptq, - lora_correction, - backup_mode, - advanced_parameters, - ) - graph = NNCFGraphFactory.create(model) - compressed_model = compression_algorithm.apply(model, graph, dataset=dataset) - compressed_model = GraphModule(compressed_model, compressed_model.graph) - compressed_model = _disallow_eval_train(compressed_model) + with torch.no_grad(): + compression_algorithm = WeightCompression( + mode, + ratio, + group_size, + ignored_scope, + all_layers, + sensitivity_metric, + awq, + subset_size, + scale_estimation, + gptq, + lora_correction, + backup_mode, + advanced_parameters, + ) + graph = NNCFGraphFactory.create(model) + compressed_model = compression_algorithm.apply(model, graph, dataset=dataset) + compressed_model = GraphModule(compressed_model, compressed_model.graph) + compressed_model = _disallow_eval_train(compressed_model) return compressed_model diff --git a/nncf/torch/quantization/quantize_model.py b/nncf/torch/quantization/quantize_model.py index 3e5c9af0ce4..ddc97c13d90 100644 --- a/nncf/torch/quantization/quantize_model.py +++ b/nncf/torch/quantization/quantize_model.py @@ -101,21 +101,21 @@ def compress_weights_impl( """ Implementation of the `compress_weights()` method for the PyTorch backend. """ - - compression_algorithm = WeightCompression( - mode, - ratio, - group_size, - ignored_scope, - all_layers, - sensitivity_metric, - awq, - subset_size, - scale_estimation, - gptq, - lora_correction, - backup_mode, - advanced_parameters, - ) - graph = NNCFGraphFactory.create(model) - return compression_algorithm.apply(model, graph, dataset=dataset) + with torch.no_grad(): + compression_algorithm = WeightCompression( + mode, + ratio, + group_size, + ignored_scope, + all_layers, + sensitivity_metric, + awq, + subset_size, + scale_estimation, + gptq, + lora_correction, + backup_mode, + advanced_parameters, + ) + graph = NNCFGraphFactory.create(model) + return compression_algorithm.apply(model, graph, dataset=dataset) From a305fac031370bab353dd7a70f61dce33827135c Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Mon, 20 Jan 2025 14:34:29 +0100 Subject: [PATCH 32/46] start of cuda in conformance --- .../pipelines/lm_weight_compression.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index b72e9fb632e..b9ce4eca3e2 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -85,7 +85,9 @@ def prepare_model(self) -> None: raise RuntimeError(f"is_stateful={is_stateful} is not supported for PyTorch backend.") self.model_hf = AutoModelForCausalLM.from_pretrained( - self.model_id, torch_dtype=torch.float32, device_map="cpu" + self.model_id, + torch_dtype=torch.float32, + device_map="cpu", # TODO (kshpv): add support of 'cuda', when supported ) self.model = self.model_hf elif self.backend == BackendType.OV: @@ -157,7 +159,7 @@ def transform_fn(data, max_tokens=128, filter_bad_tokens=True): inputs[name] = np.zeros(shape) if self.backend == BackendType.TORCH: for input_name in inputs: - inputs[input_name] = torch.from_numpy(inputs[input_name]) + inputs[input_name] = torch.from_numpy(inputs[input_name]).to(self.model_hf.device) return inputs return transform_fn @@ -209,7 +211,13 @@ def 
save_compressed_model(self) -> None: ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) self.model_hf._save_config(self.output_model_dir) elif self.backend == BackendType.TORCH: - export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32") + export_from_model( + self.model_hf, + self.output_model_dir, + stateful=False, + compression_option="fp32", + device=self.model_hf.device, + ) def get_num_compressed(self) -> None: """ From ddee49532fdf5705fe5d1e6416621c7d8b2d375f Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 11:53:40 +0100 Subject: [PATCH 33/46] add scale estimation test --- .../template_test_weights_compression.py | 51 ++++++-- tests/openvino/native/models.py | 15 +++ .../quantization/test_weights_compression.py | 44 +++++++ .../fx/test_weights_compression_backends.py | 38 ++++++ tests/torch/ptq/test_weights_compression.py | 110 ++++++++++++------ 5 files changed, 216 insertions(+), 42 deletions(-) create mode 100644 tests/torch/fx/test_weights_compression_backends.py diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index a71d0f7d24a..d5bdf386300 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -16,11 +16,14 @@ import numpy as np import pytest +import nncf.tensor.functions as fns from nncf import CompressWeightsMode from nncf import SensitivityMetric from nncf.data.dataset import Dataset from nncf.quantization import compress_weights from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA +from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation +from nncf.tensor import Tensor from nncf.tensor import TensorDataType TModel = TypeVar("TModel") @@ -39,13 +42,11 @@ class TemplateWeightCompression(ABC): @staticmethod @abstractmethod def cast_to(x: TTensor, dtype: TensorDataType) -> TTensor: - pass + """Casts a backend tensor to backend tensor with specified dtype.""" @abstractmethod def get_matmul_model() -> TModel: - """ - Returns a backend model for test_data_based_criterion. - """ + """Returns a backend model for test_data_based_criterion.""" @pytest.mark.parametrize( ("mode", "ref_act_score", "ref_score"), @@ -80,13 +81,11 @@ def test_data_based_criterion(self, mode, ref_score, ref_act_score, mocker): @abstractmethod def get_sequential_matmul_model() -> TModel: - """ - Returns a backend model for test_mixed_precision. - """ + """Returns a backend model for test_mixed_precision.""" @abstractmethod def to_tensor(x: TTensor) -> TTensor: - pass + """Returns a backend tensor.""" @abstractmethod def check_weights(model: TModel, ref_ids: List[int]) -> None: @@ -128,3 +127,39 @@ def test_mixed_precision(self, mode, all_layers, ratio, ref_ids): dataset=dataset, ) self.check_weights(compressed_model, ref_ids) + + @staticmethod + @abstractmethod + def get_model_for_test_scale_estimation(): + """ + Returns a backend model for test_scale_estimation. + """ + + @staticmethod + @abstractmethod + def get_scale_estimation_ref(): + """ + Returns the reference output of calculate_quantization_params of ScaleEstimation. 
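+        The expected tensor holds one scale per (output channel, weight group)
+        pair of the test model's matmul.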
+ """ + + def test_scale_estimation(self, mocker): + calc_q_params_spy = mocker.spy(ScaleEstimation, "calculate_quantization_params") + model = self.get_model_for_test_scale_estimation() + + # prepare dataset with one input tensor + input = np.arange(0, 32 * 32, dtype=np.float32).reshape(1, 32, 32) + input[0, 15] *= 100 # make one channel relatively higher. + input = self.to_tensor(input) + dataset = Dataset([input]) + + _ = compress_weights( + model, + mode=CompressWeightsMode.INT4_ASYM, + ratio=1.0, + group_size=32, + scale_estimation=True, + all_layers=True, + dataset=dataset, + ) + reference = self.get_scale_estimation_ref() + assert fns.allclose(Tensor(reference), calc_q_params_spy.spy_return[0]) diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index bbbde714d39..56ef047f97f 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1185,3 +1185,18 @@ def _create_ov_model(self): model = ov.Model([sin_result, cos_result], [position_ids]) return model + + +class MLP(OVReferenceModel): + def _create_ov_model(self): + input_node = opset.parameter([1, 32, 32], name="Input") + + weights_data = np.arange(0, 32 * 32, dtype=np.float32).reshape(32, 32) + weights_node = opset.constant(weights_data, dtype=np.float32, name="Weights") + + matmul_node = opset.matmul(input_node, weights_node, transpose_a=False, transpose_b=True, name="MatMul") + + result_node = opset.result(matmul_node, name="Result") + + model = ov.Model([result_node], [input_node], name="MLP_Model") + return model diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index aedfc1d0573..e0d072cb04a 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -50,6 +50,7 @@ from tests.cross_fw.test_templates.template_test_weights_compression import ACTIVATION from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.openvino.native.common import get_actual_reference_for_current_openvino +from tests.openvino.native.models import MLP from tests.openvino.native.models import AWQActMatmulModel from tests.openvino.native.models import AWQMatmulModel from tests.openvino.native.models import GatherAndMatmulShareData @@ -1546,3 +1547,46 @@ def check_weights(model: ov.Model, ref_ids: List[int]) -> None: names = {op.get_friendly_name() for op in model.get_ordered_ops() if op.get_element_type() == ov.Type.i4} ref_nf4_nodes = {f"weights_{i}" for i in ref_ids} assert ref_nf4_nodes == names + + @staticmethod + def get_model_for_test_scale_estimation(): + return MLP().ov_model + + @staticmethod + def get_scale_estimation_ref(): + return np.array( + [ + [[2.0666666]], + [[3.7624273]], + [[5.884783]], + [[8.03606]], + [[10.136832]], + [[12.291862]], + [[14.34415]], + [[16.449669]], + [[18.608639]], + [[20.802698]], + [[22.9477]], + [[25.083504]], + [[27.152409]], + [[29.141987]], + [[31.171442]], + [[33.044716]], + [[35.178047]], + [[37.31138]], + [[39.444714]], + [[41.578045]], + [[43.71138]], + [[45.844715]], + [[47.978046]], + [[50.11138]], + [[52.244713]], + [[54.378044]], + [[56.511383]], + [[58.644714]], + [[60.77805]], + [[62.91138]], + [[65.044716]], + [[67.17805]], + ] + ) diff --git a/tests/torch/fx/test_weights_compression_backends.py b/tests/torch/fx/test_weights_compression_backends.py new file mode 100644 index 00000000000..bd66093ec19 --- /dev/null +++ 
b/tests/torch/fx/test_weights_compression_backends.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from nncf.quantization.algorithms.weight_compression.mixed_precision import HAWQCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MaxVarianceCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MeanMaxCriterion +from nncf.quantization.algorithms.weight_compression.mixed_precision import MeanVarianceCriterion +from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend +from tests.cross_fw.test_templates.test_weights_compression_backends import TemplateTestMixedPrecisionAlgoBackend + + +class TestPTMixedPrecisionAlgoBackend(TemplateTestMixedPrecisionAlgoBackend): + def get_hawq_with_backend(self, subset_size): + hawq = HAWQCriterion(None, None, subset_size=subset_size) + hawq._backend_entity = PTMixedPrecisionAlgoBackend() + return hawq + + def get_mean_variance_with_backend(self, subset_size: int): + mean_variance = MeanVarianceCriterion(None, None, subset_size=subset_size) + mean_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return mean_variance + + def get_max_variance_with_backend(self, subset_size: int): + max_variance = MaxVarianceCriterion(None, None, subset_size=subset_size) + max_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return max_variance + + def get_mean_max_with_backend(self, subset_size: int): + mean_max_variance = MeanMaxCriterion(None, None, subset_size=subset_size) + mean_max_variance._backend_entity = PTMixedPrecisionAlgoBackend() + return mean_max_variance diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 06047849ba1..cee43f783d0 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -44,15 +44,45 @@ UNSUPPORTED_MODES = (CompressWeightsMode.NF4, CompressWeightsMode.E2M1) -class MatMulModel(torch.nn.Module): +class SequentialMatmulModel(nn.Module): def __init__(self): + super(SequentialMatmulModel, self).__init__() + self.main_values = [10000, 1000, 1, 10, 10000] + self.layers = nn.ModuleList() + + for _, main_value in enumerate(self.main_values): + weights_data = torch.arange(0, 16, dtype=torch.float32).reshape(4, 4) + weights_data[-1, -1] = main_value + weight_tensor = torch.tensor(weights_data) + layer = nn.Linear(4, 4, bias=False) + layer.weight = nn.Parameter(weight_tensor.t()) + self.layers.append(layer) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class MatMulModel(torch.nn.Module): + def __init__(self, weight: torch.Tensor = torch.ones(size=(256, 256), dtype=torch.float32)): super().__init__() - self.w = torch.nn.Parameter(torch.ones(size=(256, 256), dtype=torch.float32)) + self.w = torch.nn.Parameter(weight) def forward(self, input): return input @ self.w +class LinearModel(torch.nn.Module): + def __init__(self, weight: 
torch.Tensor = torch.ones(size=(256, 256), dtype=torch.float32)): + super().__init__() + self.linear = torch.nn.Linear(weight.shape[0], weight.shape[1], False) + self.linear.weight = torch.nn.Parameter(weight) + + def forward(self, input): + return self.linear(input) + + class FunctionalModel(torch.nn.Module): def __init__(self): super().__init__() @@ -326,41 +356,10 @@ def test_pack_int4(): assert torch.all(unpacked_w == w_int8) -class IdentityMatmul(torch.nn.Module): - def __init__(self): - super().__init__() - self.w = torch.nn.Parameter( - torch.eye(3, dtype=torch.float32) * 255, - ) - - def forward(self, input): - return input @ self.w - - -class SequentialMatmulModel(nn.Module): - def __init__(self): - super(SequentialMatmulModel, self).__init__() - self.main_values = [10000, 1000, 1, 10, 10000] - self.layers = nn.ModuleList() - - for _, main_value in enumerate(self.main_values): - weights_data = torch.arange(0, 16, dtype=torch.float32).reshape(4, 4) - weights_data[-1, -1] = main_value - weight_tensor = torch.tensor(weights_data) - layer = nn.Linear(4, 4, bias=False) - layer.weight = nn.Parameter(weight_tensor.t()) - self.layers.append(layer) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - return x - - class TestPTTemplateWeightCompression(TemplateWeightCompression): @staticmethod def get_matmul_model() -> torch.nn.Module: - return IdentityMatmul() + return MatMulModel(255 * torch.eye(3, dtype=torch.float32)) @staticmethod def get_sequential_matmul_model() -> torch.nn.Module: @@ -381,3 +380,46 @@ def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None: assert torch.numel(op.weight) == 8 # workaround to detect uint4 weights else: assert torch.numel(op.weight) == 16 + + @staticmethod + def get_model_for_test_scale_estimation(): + return LinearModel(torch.arange(0, 32 * 32, dtype=torch.float32).reshape(32, 32)) + + @staticmethod + def get_scale_estimation_ref(): + return torch.tensor( + [ + [[2.0666666]], + [[3.7624271]], + [[5.8847833]], + [[8.0360603]], + [[10.1368332]], + [[12.2918606]], + [[14.3441496]], + [[16.4496689]], + [[18.6086369]], + [[20.8027000]], + [[22.9477024]], + [[25.0835018]], + [[27.1524105]], + [[29.1419849]], + [[31.1714401]], + [[33.0447121]], + [[35.1780472]], + [[37.3113823]], + [[39.4447136]], + [[41.5780487]], + [[43.7113838]], + [[45.8447189]], + [[47.9780464]], + [[50.1113815]], + [[52.2447128]], + [[54.3780441]], + [[56.5113831]], + [[58.6447144]], + [[60.7780533]], + [[62.9113808]], + [[65.0447083]], + [[67.1780548]], + ] + ) From f89ae9d7ede3174122bd0deb9df832b1b7cc4578 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 11:55:45 +0100 Subject: [PATCH 34/46] upd year --- tests/torch/fx/test_weights_compression_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/torch/fx/test_weights_compression_backends.py b/tests/torch/fx/test_weights_compression_backends.py index bd66093ec19..fccdd0e2c01 100644 --- a/tests/torch/fx/test_weights_compression_backends.py +++ b/tests/torch/fx/test_weights_compression_backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at From 026a0ed8ff57efefc94f162d5d8d56c9162445fc Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 14:07:30 +0100 Subject: [PATCH 35/46] add tinyllama_scale_estimation_group_size_64 --- tests/post_training/data/wc_reference_data.yaml | 8 ++++++++ tests/post_training/model_scope.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 3d27d81ee20..1309dbbc13c 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -56,3 +56,11 @@ tinyllama_int4_data_free_backend_TORCH: metric_value: 0.73873 num_int4: 114 num_int8: 84 +tinyllama_scale_estimation_group_size_64_backend_OV: + metric_value: 0.8566 + num_int4: 94 + num_int8: 124 +tinyllama_scale_estimation_group_size_64_backend_TORCH: + metric_value: 0.8566 + num_int4: 94 + num_int8: 124 \ No newline at end of file diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index ad2dd36757d..b9e07c77fab 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -531,6 +531,21 @@ }, "backends": [BackendType.OV], }, + { + "reported_name": "tinyllama_scale_estimation_group_size_64", + "model_id": "tinyllama/tinyllama-1.1b-step-50k-105b", + "pipeline_cls": LMWeightCompression, + "compression_params": { + "group_size": 64, + "ratio": 0.8, + "mode": CompressWeightsMode.INT4_SYM, + "scale_estimation": True, + "advanced_parameters": AdvancedCompressionParameters( + scale_estimation_params=AdvancedScaleEstimationParameters(32, 5, 10, 1.0) + ), + }, + "backends": [BackendType.OV, BackendType.TORCH], + }, ] From e3f12c216b0d8f78d409bd37a0612deb97f9a0b6 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 14:10:11 +0100 Subject: [PATCH 36/46] torch.no_grad -> torch.inference_mode --- nncf/experimental/torch/fx/quantization/quantize_model.py | 2 +- .../quantization/algorithms/weight_compression/torch_backend.py | 2 +- nncf/torch/quantization/quantize_model.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nncf/experimental/torch/fx/quantization/quantize_model.py b/nncf/experimental/torch/fx/quantization/quantize_model.py index ef2aac355ca..c30f653ce6e 100644 --- a/nncf/experimental/torch/fx/quantization/quantize_model.py +++ b/nncf/experimental/torch/fx/quantization/quantize_model.py @@ -135,7 +135,7 @@ def compress_weights_impl( """ Implementation of the `compress_weights()` method for the Torch Fx backend. 
""" - with torch.no_grad(): + with torch.inference_mode(): compression_algorithm = WeightCompression( mode, ratio, diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 518be41721d..1f843679737 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -62,7 +62,7 @@ def _prepare_inputs( - tensor: torch.Tensor, scale: torch.Tensor, zero_point=Optional[torch.Tensor] + tensor: torch.Tensor, scale: torch.Tensor, zero_point: Optional[torch.Tensor] = None ) -> Tuple[Tensor, Tensor, Optional[Tensor]]: tensor, scale = Tensor(tensor), Tensor(scale) if zero_point is not None: diff --git a/nncf/torch/quantization/quantize_model.py b/nncf/torch/quantization/quantize_model.py index ddc97c13d90..57408abff31 100644 --- a/nncf/torch/quantization/quantize_model.py +++ b/nncf/torch/quantization/quantize_model.py @@ -101,7 +101,7 @@ def compress_weights_impl( """ Implementation of the `compress_weights()` method for the PyTorch backend. """ - with torch.no_grad(): + with torch.inference_mode(): compression_algorithm = WeightCompression( mode, ratio, From a347a2598d063822a0deb98fe369468792433f7d Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Tue, 21 Jan 2025 14:50:20 +0100 Subject: [PATCH 37/46] upd reference --- .../template_test_weights_compression.py | 7 +-- tests/openvino/native/models.py | 6 +-- .../quantization/test_weights_compression.py | 44 +++++-------------- tests/torch/ptq/test_weights_compression.py | 42 ++++-------------- 4 files changed, 26 insertions(+), 73 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index d5bdf386300..31cfa669df1 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -147,8 +147,9 @@ def test_scale_estimation(self, mocker): model = self.get_model_for_test_scale_estimation() # prepare dataset with one input tensor - input = np.arange(0, 32 * 32, dtype=np.float32).reshape(1, 32, 32) - input[0, 15] *= 100 # make one channel relatively higher. + input = np.arange(0, 8 * 8, dtype=np.float32).reshape(1, 8, 8) + input[0, 4] *= 100 # make one channel relatively higher. 
+ input = self.to_tensor(input) dataset = Dataset([input]) @@ -156,7 +157,7 @@ def test_scale_estimation(self, mocker): model, mode=CompressWeightsMode.INT4_ASYM, ratio=1.0, - group_size=32, + group_size=4, scale_estimation=True, all_layers=True, dataset=dataset, diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 56ef047f97f..c9ec2f8343d 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1187,11 +1187,11 @@ def _create_ov_model(self): return model -class MLP(OVReferenceModel): +class MatMul(OVReferenceModel): def _create_ov_model(self): - input_node = opset.parameter([1, 32, 32], name="Input") + input_node = opset.parameter([1, 8, 8], name="Input") - weights_data = np.arange(0, 32 * 32, dtype=np.float32).reshape(32, 32) + weights_data = np.arange(0, 8 * 8, dtype=np.float32).reshape(8, 8) weights_node = opset.constant(weights_data, dtype=np.float32, name="Weights") matmul_node = opset.matmul(input_node, weights_node, transpose_a=False, transpose_b=True, name="MatMul") diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index e0d072cb04a..6128b2d7829 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -50,13 +50,13 @@ from tests.cross_fw.test_templates.template_test_weights_compression import ACTIVATION from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.openvino.native.common import get_actual_reference_for_current_openvino -from tests.openvino.native.models import MLP from tests.openvino.native.models import AWQActMatmulModel from tests.openvino.native.models import AWQMatmulModel from tests.openvino.native.models import GatherAndMatmulShareData from tests.openvino.native.models import GatherWithTwoReductionAxes from tests.openvino.native.models import IdentityMatmul from tests.openvino.native.models import IntegerModel +from tests.openvino.native.models import MatMul from tests.openvino.native.models import ModelNamedConsts from tests.openvino.native.models import OVReferenceModel from tests.openvino.native.models import SequentialMatmulModel @@ -1550,43 +1550,19 @@ def check_weights(model: ov.Model, ref_ids: List[int]) -> None: @staticmethod def get_model_for_test_scale_estimation(): - return MLP().ov_model + return MatMul().ov_model @staticmethod def get_scale_estimation_ref(): return np.array( [ - [[2.0666666]], - [[3.7624273]], - [[5.884783]], - [[8.03606]], - [[10.136832]], - [[12.291862]], - [[14.34415]], - [[16.449669]], - [[18.608639]], - [[20.802698]], - [[22.9477]], - [[25.083504]], - [[27.152409]], - [[29.141987]], - [[31.171442]], - [[33.044716]], - [[35.178047]], - [[37.31138]], - [[39.444714]], - [[41.578045]], - [[43.71138]], - [[45.844715]], - [[47.978046]], - [[50.11138]], - [[52.244713]], - [[54.378044]], - [[56.511383]], - [[58.644714]], - [[60.77805]], - [[62.91138]], - [[65.044716]], - [[67.17805]], + [[0.2], [0.41354424]], + [[0.6782236], [0.9470368]], + [[1.1691767], [1.4355733]], + [[1.7025099], [1.9689066]], + [[2.2722175], [2.543369]], + [[2.8146443], [3.0858421]], + [[3.3025098], [3.5689068]], + [[3.8358433], [4.1022396]], ] ) diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index cee43f783d0..92f90190a0d 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ 
b/tests/torch/ptq/test_weights_compression.py
@@ -383,43 +380,46 @@ def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None:
 
     @staticmethod
     def get_model_for_test_scale_estimation():
-        return LinearModel(torch.arange(0, 32 * 32, dtype=torch.float32).reshape(32, 32))
+        return LinearModel(torch.arange(0, 8 * 8, dtype=torch.float32).reshape(8, 8))
 
     @staticmethod
     def get_scale_estimation_ref():
         return torch.tensor(
             [
-                [[2.0666666]],
-                [[3.7624271]],
-                [[5.8847833]],
-                [[8.0360603]],
-                [[10.1368332]],
-                [[12.2918606]],
-                [[14.3441496]],
-                [[16.4496689]],
-                [[18.6086369]],
-                [[20.8027000]],
-                [[22.9477024]],
-                [[25.0835018]],
-                [[27.1524105]],
-                [[29.1419849]],
-                [[31.1714401]],
-                [[33.0447121]],
-                [[35.1780472]],
-                [[37.3113823]],
-                [[39.4447136]],
-                [[41.5780487]],
-                [[43.7113838]],
-                [[45.8447189]],
-                [[47.9780464]],
-                [[50.1113815]],
-                [[52.2447128]],
-                [[54.3780441]],
-                [[56.5113831]],
-                [[58.6447144]],
-                [[60.7780533]],
-                [[62.9113808]],
-                [[65.0447083]],
-                [[67.1780548]],
+                [[0.200000], [0.413544]],
+                [[0.678224], [0.947037]],
+                [[1.169177], [1.435573]],
+                [[1.702510], [1.968907]],
+                [[2.272218], [2.543369]],
+                [[2.814644], [3.085842]],
+                [[3.302510], [3.568907]],
+                [[3.835843], [4.102240]],
             ]
         )

From 601f2e4e9290dc5162b6a3b4f2a7dcc3714ebe78 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Wed, 22 Jan 2025 10:47:11 +0100
Subject: [PATCH 38/46] test: upd int4 weight locator for torch

---
 .../native/quantization/test_weights_compression.py |  4 ++--
 tests/torch/ptq/test_weights_compression.py         | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index 6128b2d7829..c45f98fa72d 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -1545,8 +1545,8 @@ def cast_to(x: np.ndarray, dtype: TensorDataType) -> np.ndarray:
     @staticmethod
     def check_weights(model: ov.Model, ref_ids: List[int]) -> None:
         names = {op.get_friendly_name() for op in model.get_ordered_ops() if op.get_element_type() == ov.Type.i4}
-        ref_nf4_nodes = {f"weights_{i}" for i in ref_ids}
-        assert ref_nf4_nodes == names
+        low_precision_nodes = {f"weights_{i}" for i in ref_ids}
+        assert low_precision_nodes == names
 
     @staticmethod
     def get_model_for_test_scale_estimation():
diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py
index 92f90190a0d..06c69887bff 100644
--- a/tests/torch/ptq/test_weights_compression.py
+++ b/tests/torch/ptq/test_weights_compression.py
@@ -375,11 +375,11 @@ def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor:
 
     @staticmethod
     def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None:
-        for i, op in enumerate(model.layers):
-            if i in ref_ids:
-                assert torch.numel(op.weight) == 8  # workaround to detect uint4 weights
-            else:
-                assert torch.numel(op.weight) == 16
+        low_precision_nodes = {f"{i}_weight" for i in ref_ids}
+        for op_name, op in model.nncf.external_op.items():
+            for name in low_precision_nodes:
+                if name in op_name:
+                    assert isinstance(op, INT4SymmetricWeightsDecompressor)

From 32bc0e59ab71a83e839389bd852b99d60eac9df0 Mon Sep 17 00:00:00 2001
From: Aleksei Kashapov
Date: Wed, 22 Jan 2025 10:59:46 +0100
Subject: [PATCH 39/46] upd license year

---
 tests/torch/ptq/test_weights_compression_backends.py | 2 +-
 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/tests/torch/ptq/test_weights_compression_backends.py b/tests/torch/ptq/test_weights_compression_backends.py index bd66093ec19..fccdd0e2c01 100644 --- a/tests/torch/ptq/test_weights_compression_backends.py +++ b/tests/torch/ptq/test_weights_compression_backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2025 Intel Corporation # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at From 568809ce8d380f78526fd8c4fb966e52d6a61535 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Thu, 23 Jan 2025 16:41:10 +0100 Subject: [PATCH 40/46] rebase --- .../weight_compression/scale_estimation.py | 5 +-- .../weight_compression/torch_backend.py | 39 ------------------- 2 files changed, 1 insertion(+), 43 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index e09bb77b7f2..0e812b71760 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -13,15 +13,14 @@ from typing import Dict, List, Optional, Tuple, TypeVar import nncf -from nncf import Dataset from nncf.common.graph.graph import NNCFGraph from nncf.common.logging.track_progress import track -from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale @@ -94,8 +93,6 @@ def apply( graph: NNCFGraph, all_weight_params: List[WeightCompressionParameters], statistics: Dict[str, WCTensorStatistic], - statistic_points: Optional[StatisticPointsContainer] = None, - dataset: Optional[Dataset] = None, backend_entity: Optional[WeightCompressionAlgoBackend] = None, ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: """ diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 1f843679737..d4741a6c2d1 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -37,12 +37,9 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from 
nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType from nncf.torch.dynamic_graph.scope import Scope @@ -61,34 +58,6 @@ from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor -def _prepare_inputs( - tensor: torch.Tensor, scale: torch.Tensor, zero_point: Optional[torch.Tensor] = None -) -> Tuple[Tensor, Tensor, Optional[Tensor]]: - tensor, scale = Tensor(tensor), Tensor(scale) - if zero_point is not None: - zero_point = Tensor(zero_point) - return tensor, scale, zero_point - - -def get_compress_fn(config: WeightCompressionConfig) -> Callable[[Tuple], Tensor]: - def _forward_fn(inputs: Tuple) -> Tensor: - tensor, scale, zero_point = _prepare_inputs(*inputs) - quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) - return quantized.data - - return _forward_fn - - -def get_compress_decompress_fn(config: WeightCompressionConfig) -> Callable[[Tuple], Tensor]: - def _forward_fn(inputs: Tuple) -> Tensor: - tensor, scale, zero_point = _prepare_inputs(*inputs) - quantized = calculate_quantized_weight(tensor, scale=scale, zero_point=zero_point, config=config) - dequantized = do_int_dequantization(quantized, scale=scale, zero_point=zero_point) - return dequantized.data - - return _forward_fn - - class PTWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): TARGET_TYPE_TO_PT_INS_TYPE_MAP = { TargetType.PRE_LAYER_OPERATION: TargetType.OPERATOR_PRE_HOOK, @@ -241,14 +210,6 @@ def insert_adapters( ) -> None: pass - @staticmethod - def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - return get_compress_decompress_fn(config) - - @staticmethod - def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): - return get_compress_fn(config) - @staticmethod def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]: def filter_func(point: StatisticPoint) -> bool: From 8c7efd601ed71e082a7666adef51916f66364982 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 11:09:58 +0100 Subject: [PATCH 41/46] add test on scale estimation --- nncf/torch/engine.py | 1 - .../template_test_weights_compression.py | 45 +++++++++++++++++-- tests/openvino/native/models.py | 4 +- .../quantization/test_weights_compression.py | 43 ++++++++++++++---- tests/torch/ptq/test_weights_compression.py | 43 +++++++++++++----- 5 files changed, 111 insertions(+), 25 deletions(-) diff --git a/nncf/torch/engine.py b/nncf/torch/engine.py index 239c6857c24..fbbc1d083b3 100644 --- a/nncf/torch/engine.py +++ b/nncf/torch/engine.py @@ -48,5 +48,4 @@ def infer( return self._model(**input_data) if isinstance(input_data, tuple): return self._model(*input_data) - return self._model(input_data) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 31cfa669df1..0457d66ba4f 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -11,6 +11,7 @@ import math from abc import ABC from abc import 
abstractmethod +from copy import deepcopy from typing import List, TypeVar import numpy as np @@ -143,13 +144,12 @@ def get_scale_estimation_ref(): """ def test_scale_estimation(self, mocker): + """Checks that scales match the reference.""" calc_q_params_spy = mocker.spy(ScaleEstimation, "calculate_quantization_params") model = self.get_model_for_test_scale_estimation() # prepare dataset with one input tensor - input = np.arange(0, 8 * 8, dtype=np.float32).reshape(1, 8, 8) - input[0, 4] *= 100 # make one channel relatively higher. - + input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) input = self.to_tensor(input) dataset = Dataset([input]) @@ -157,10 +157,47 @@ def test_scale_estimation(self, mocker): model, mode=CompressWeightsMode.INT4_ASYM, ratio=1.0, - group_size=4, + group_size=8, scale_estimation=True, all_layers=True, dataset=dataset, ) reference = self.get_scale_estimation_ref() assert fns.allclose(Tensor(reference), calc_q_params_spy.spy_return[0]) + + @abstractmethod + def get_orig_weight(model: TModel) -> Tensor: + """Returns original weight.""" + + @abstractmethod + def get_decompressed_weight(compressed_model: TModel, input: TTensor) -> Tensor: + """Returns decompressed weight""" + + def test_scale_estimation_outlier_channel_has_lowest_error(self): + """Checks that outlier channel has a lowest error after quantization.""" + OUTLIER_CHANNEL = 4 + model = self.get_model_for_test_scale_estimation() + + # prepare dataset with one input tensor + input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) + input[ + :, :, OUTLIER_CHANNEL + ] *= 1000 # make one channel relatively higher. This channel should have lowest error. + input = self.to_tensor(input) + dataset = Dataset([input]) + + compressed_model = compress_weights( + deepcopy(model), + mode=CompressWeightsMode.INT4_ASYM, + ratio=1.0, + group_size=-1, + scale_estimation=True, + all_layers=True, + dataset=dataset, + ) + + decompressed_weight = self.get_decompressed_weight(compressed_model, input) + original_weight = self.get_orig_weight(model) + diff = (decompressed_weight - original_weight) ** 2 + layer_err = fns.mean(diff, axis=0) / fns.mean(original_weight**2, axis=0) + assert fns.argsort(layer_err)[0] == OUTLIER_CHANNEL diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index c9ec2f8343d..e5efdaf8235 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1189,9 +1189,9 @@ def _create_ov_model(self): class MatMul(OVReferenceModel): def _create_ov_model(self): - input_node = opset.parameter([1, 8, 8], name="Input") + input_node = opset.parameter([1, 4, 8], name="Input") - weights_data = np.arange(0, 8 * 8, dtype=np.float32).reshape(8, 8) + weights_data = np.arange(0, 16 * 8, dtype=np.float32).reshape(16, 8) weights_node = opset.constant(weights_data, dtype=np.float32, name="Weights") matmul_node = opset.matmul(input_node, weights_node, transpose_a=False, transpose_b=True, name="MatMul") diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 881f9596936..0a2adc6dd85 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -26,6 +26,7 @@ from nncf.common.utils.debug import nncf_debug from nncf.data.dataset import Dataset from nncf.experimental.common.tensor_statistics.collectors import AggregatorBase +from nncf.openvino.graph.model_transformer import OVModelTransformer 
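The outlier-channel test added above exercises the core idea of scale estimation: rather than taking the scale straight from the weight range, search for a per-group rescaling that minimizes the activation-weighted reconstruction error. A simplified sketch of such a search (an assumed grid-search form for illustration, not NNCF's actual algorithm):

import numpy as np

def quant_dequant(w, scale, zero_point, levels=16):
    # INT4_ASYM round trip: quantize onto integer levels 0..15, then dequantize.
    q = np.clip(np.round(w / scale + zero_point), 0, levels - 1)
    return (q - zero_point) * scale

def estimate_scale(w_group, x_stat, steps=32):
    # x_stat holds per-channel activation magnitudes; a channel boosted by 1000
    # dominates the objective, so the chosen scale keeps its weights accurate.
    init = (w_group.max() - w_group.min()) / 15.0
    zero_point = np.round(-w_group.min() / init)
    candidates = init * np.linspace(0.7, 1.1, steps)
    errors = [
        np.mean((x_stat * (w_group - quant_dequant(w_group, s, zero_point))) ** 2)
        for s in candidates
    ]
    return candidates[int(np.argmin(errors))]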
from nncf.openvino.graph.node_utils import get_const_value from nncf.parameters import BackupMode from nncf.quantization import compress_weights @@ -1524,13 +1525,39 @@ def get_model_for_test_scale_estimation(): def get_scale_estimation_ref(): return np.array( [ - [[0.2], [0.41354424]], - [[0.6782236], [0.9470368]], - [[1.1691767], [1.4355733]], - [[1.7025099], [1.9689066]], - [[2.2722175], [2.543369]], - [[2.8146443], [3.0858421]], - [[3.3025098], [3.5689068]], - [[3.8358433], [4.1022396]], + [[0.473328]], + [[0.929023]], + [[1.446527]], + [[1.920595]], + [[2.517053]], + [[3.030101]], + [[3.584278]], + [[4.04351]], + [[4.620007]], + [[5.165322]], + [[5.710637]], + [[6.122580]], + [[6.655914]], + [[7.237173]], + [[7.722581]], + [[8.255914]], ] ) + + @staticmethod + def get_orig_weight(model: ov.Model) -> Tensor: + for op in model.get_ordered_ops(): + op_name = op.get_friendly_name() + if op.get_type_name() == "Constant" and op_name == "Weights": + return Tensor(op.data) + + @staticmethod + def get_decompressed_weight(compressed_model: ov.Model, input: np.ndarray) -> Tensor: + # Insert extra output to get the compressed weights. + node = [op for op in compressed_model.get_ops() if op.get_friendly_name() == "Weights/fq_weights_1/convert"][0] + output = node.output(0) + extra_outputs = [(output, 0, None)] + model = OVModelTransformer._insert_outputs(compressed_model, extra_outputs) + compiled_model = ov.compile_model(model, device_name="CPU") + weight_output = compiled_model(input)[1] + return Tensor(weight_output) diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index 06c69887bff..0889284b453 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -22,6 +22,7 @@ from nncf import SensitivityMetric from nncf.quantization import compress_weights from nncf.quantization.advanced_parameters import AdvancedCompressionParameters +from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.torch import wrap_model from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor @@ -63,6 +64,9 @@ def forward(self, x): x = layer(x) return x + def get_weight_names_in_exec_order(self): + return [f"{i}_weight" for i in range(len(self.main_values))] + class MatMulModel(torch.nn.Module): def __init__(self, weight: torch.Tensor = torch.ones(size=(256, 256), dtype=torch.float32)): @@ -375,7 +379,8 @@ def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: @staticmethod def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None: - low_precision_nodes = {f"{i}_weight" for i in ref_ids} + all_names = model.get_weight_names_in_exec_order() + low_precision_nodes = list(map(lambda i: all_names[i], ref_ids)) for op_name, op in model.nncf.external_op.items(): for name in low_precision_nodes: if name in op_name: @@ -383,19 +388,37 @@ def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None: @staticmethod def get_model_for_test_scale_estimation(): - return LinearModel(torch.arange(0, 8 * 8, dtype=torch.float32).reshape(8, 8)) + return LinearModel(torch.arange(0, 8 * 16, dtype=torch.float32).reshape(16, 8)) @staticmethod def get_scale_estimation_ref(): return torch.tensor( [ - [[0.200000], [0.413544]], - [[0.678224], [0.947037]], - [[1.169177], [1.435573]], - [[1.702510], [1.968907]], - [[2.272218], [2.543369]], - [[2.814644], [3.085842]], - [[3.302510], [3.568907]], - [[3.835843], [4.102240]], + [[0.473328]], + [[0.929023]], + [[1.446527]], + 
[[1.920595]], + [[2.517054]], + [[3.030102]], + [[3.584279]], + [[4.043509]], + [[4.620008]], + [[5.165322]], + [[5.710637]], + [[6.122581]], + [[6.655914]], + [[7.237174]], + [[7.722580]], + [[8.255914]], ] ) + + @staticmethod + def get_orig_weight(model: torch.nn.Module) -> Tensor: + return Tensor(model.linear.weight) + + @staticmethod + def get_decompressed_weight(compressed_model: torch.nn.Module, input: torch.Tensor) -> Tensor: + weight = compressed_model.linear.weight + unpacked_w = compressed_model.nncf.external_op.weights_decompressor_linear_weight(weight) + return Tensor(unpacked_w) From 64f588fa787842c499431b34cee10c5c83e9e09e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 14:10:07 +0100 Subject: [PATCH 42/46] add check on reducing error after SE --- .../template_test_weights_compression.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 0457d66ba4f..5c94cc30f22 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -22,8 +22,10 @@ from nncf import SensitivityMetric from nncf.data.dataset import Dataset from nncf.quantization import compress_weights +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation +from nncf.quantization.algorithms.weight_compression.weight_lowering import quantize_dequantize_weight from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -39,6 +41,11 @@ MAX_BASELINE_SCORE = 1 / 1.1920928955078125e-07 +def get_realtive_error(weight_1: Tensor, weight_2: Tensor, axis: int = 0) -> Tensor: + diff = (weight_1 - weight_2) ** 2 + return fns.mean(diff, axis=axis) / fns.mean(weight_1**2, axis=axis) + + class TemplateWeightCompression(ABC): @staticmethod @abstractmethod @@ -180,9 +187,7 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): # prepare dataset with one input tensor input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) - input[ - :, :, OUTLIER_CHANNEL - ] *= 1000 # make one channel relatively higher. This channel should have lowest error. + input[:, :, OUTLIER_CHANNEL] *= 1000 # make one channel relatively higher, should have lowest error. 
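For context, the round-to-nearest baseline that this new check compares against comes straight from the weight-lowering helper imported above; the call can also be used standalone, as in this sketch mirroring the test's arguments:

import numpy as np

from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import quantize_dequantize_weight
from nncf.tensor import Tensor

# Per-channel (group_size=-1) INT4_ASYM quantize-dequantize round trip of the
# raw weight -- the "no scale estimation" reference for the relative-error check.
weight = Tensor(np.arange(0, 4 * 8, dtype=np.float32).reshape(4, 8))
config = WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, -1)
w_deq = quantize_dequantize_weight(weight, config=config, reduction_axes=1)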
input = self.to_tensor(input) dataset = Dataset([input]) @@ -196,8 +201,12 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): dataset=dataset, ) - decompressed_weight = self.get_decompressed_weight(compressed_model, input) original_weight = self.get_orig_weight(model) - diff = (decompressed_weight - original_weight) ** 2 - layer_err = fns.mean(diff, axis=0) / fns.mean(original_weight**2, axis=0) - assert fns.argsort(layer_err)[0] == OUTLIER_CHANNEL + decompressed_weight_before_se = quantize_dequantize_weight( + original_weight, config=WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, -1), reduction_axes=1 + ) + decompressed_weight_after_se = self.get_decompressed_weight(compressed_model, input) + error_before_se = get_realtive_error(original_weight, decompressed_weight_before_se) + error_after_se = get_realtive_error(original_weight, decompressed_weight_after_se) + assert fns.argsort(error_after_se)[0] == OUTLIER_CHANNEL # the smallest error on the outlier channel + assert error_before_se[OUTLIER_CHANNEL] > error_after_se[OUTLIER_CHANNEL] From be92375626ae063e6b111f1d3485ae1a49c7b9c0 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 14:30:20 +0100 Subject: [PATCH 43/46] upd atol for model (difference across devices) --- tests/post_training/data/wc_reference_data.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 1309dbbc13c..5bed81c4c79 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -39,6 +39,7 @@ tinyllama_scale_estimation_per_channel_backend_TORCH: metric_value: 0.81389 num_int4: 188 num_int8: 124 + atol: 0.006 # difference across devices: 0.80873 vs 0.81389 tinyllama_data_aware_lora_stateful_backend_OV: metric_value: 0.83446 num_int4: 94 From 9037dd2a684f16563a68f51ee1f9ca1955407264 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 15:09:22 +0100 Subject: [PATCH 44/46] no copy --- .../template_test_weights_compression.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 5c94cc30f22..ae34b1f4c21 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -11,7 +11,6 @@ import math from abc import ABC from abc import abstractmethod -from copy import deepcopy from typing import List, TypeVar import numpy as np @@ -41,7 +40,7 @@ MAX_BASELINE_SCORE = 1 / 1.1920928955078125e-07 -def get_realtive_error(weight_1: Tensor, weight_2: Tensor, axis: int = 0) -> Tensor: +def get_relative_error(weight_1: Tensor, weight_2: Tensor, axis: int = 0) -> Tensor: diff = (weight_1 - weight_2) ** 2 return fns.mean(diff, axis=axis) / fns.mean(weight_1**2, axis=axis) @@ -184,6 +183,7 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): """Checks that outlier channel has a lowest error after quantization.""" OUTLIER_CHANNEL = 4 model = self.get_model_for_test_scale_estimation() + original_weight = self.get_orig_weight(model) # prepare dataset with one input tensor input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) @@ -192,7 +192,7 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): dataset = Dataset([input]) compressed_model = compress_weights( - deepcopy(model), + 
model, mode=CompressWeightsMode.INT4_ASYM, ratio=1.0, group_size=-1, @@ -201,12 +201,11 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self): dataset=dataset, ) - original_weight = self.get_orig_weight(model) decompressed_weight_before_se = quantize_dequantize_weight( original_weight, config=WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, -1), reduction_axes=1 ) decompressed_weight_after_se = self.get_decompressed_weight(compressed_model, input) - error_before_se = get_realtive_error(original_weight, decompressed_weight_before_se) - error_after_se = get_realtive_error(original_weight, decompressed_weight_after_se) + error_before_se = get_relative_error(original_weight, decompressed_weight_before_se) + error_after_se = get_relative_error(original_weight, decompressed_weight_after_se) assert fns.argsort(error_after_se)[0] == OUTLIER_CHANNEL # the smallest error on the outlier channel assert error_before_se[OUTLIER_CHANNEL] > error_after_se[OUTLIER_CHANNEL] From 34570faa2883268a6c39d50bb2cd25de52bdb427 Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 15:24:38 +0100 Subject: [PATCH 45/46] new line --- tests/post_training/data/wc_reference_data.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 5bed81c4c79..864fe133d0a 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -64,4 +64,5 @@ tinyllama_scale_estimation_group_size_64_backend_OV: tinyllama_scale_estimation_group_size_64_backend_TORCH: metric_value: 0.8566 num_int4: 94 - num_int8: 124 \ No newline at end of file + num_int8: 124 + \ No newline at end of file From 5e5440babae128b7ede8d6509de4c3a4b3d14d8e Mon Sep 17 00:00:00 2001 From: Aleksei Kashapov Date: Fri, 24 Jan 2025 15:41:58 +0100 Subject: [PATCH 46/46] polishing --- nncf/torch/engine.py | 1 + .../test_templates/template_test_weights_compression.py | 4 ++-- tests/post_training/data/wc_reference_data.yaml | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nncf/torch/engine.py b/nncf/torch/engine.py index fbbc1d083b3..ed70e8fb3a3 100644 --- a/nncf/torch/engine.py +++ b/nncf/torch/engine.py @@ -44,6 +44,7 @@ def infer( :param input_data: Inputs for the model. :return: Model outputs. """ + if isinstance(input_data, dict): return self._model(**input_data) if isinstance(input_data, tuple): diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index ae34b1f4c21..6ffc479c1f3 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -137,14 +137,14 @@ def test_mixed_precision(self, mode, all_layers, ratio, ref_ids): @staticmethod @abstractmethod - def get_model_for_test_scale_estimation(): + def get_model_for_test_scale_estimation() -> TModel: """ Returns a backend model for test_scale_estimation. """ @staticmethod @abstractmethod - def get_scale_estimation_ref(): + def get_scale_estimation_ref() -> TTensor: """ Returns the reference output of calculate_quantization_params of ScaleEstimation. 
""" diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 864fe133d0a..bb13d2eb26d 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -65,4 +65,3 @@ tinyllama_scale_estimation_group_size_64_backend_TORCH: metric_value: 0.8566 num_int4: 94 num_int8: 124 - \ No newline at end of file