Merge branch 'main' into nncf-210-update
nikita-savelyevv committed Apr 22, 2024
2 parents f99f767 + 673b88b commit 2b79a0d
Showing 31 changed files with 843 additions and 433 deletions.
14 changes: 10 additions & 4 deletions .github/workflows/test_inc.yml
@@ -32,11 +32,17 @@ jobs:
        python -m pip install --upgrade pip
        pip install cmake
        pip install py-cpuinfo
-       pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
        pip install .[neural-compressor,diffusers,tests]
-       pip install intel-extension-for-pytorch==2.1.100
-       pip install intel-extension-for-transformers==1.3.2
+       pip install intel-extension-for-transformers
        pip install peft
    - name: Test with Pytest
      run: |
-       pytest tests/neural_compressor/
+       pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
+   - name: Test IPEX
+     run: |
+       pip uninstall -y intel-extension-for-transformers
+       pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
+       pip install intel-extension-for-pytorch==2.1.100
+       pytest tests/neural_compressor/test_ipex.py
6 changes: 5 additions & 1 deletion .github/workflows/test_openvino.yml
@@ -35,7 +35,11 @@ jobs:
        pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
    - name: Test with Pytest
      run: |
-       pytest tests/openvino/ --ignore test_modeling_basic --durations=0
+       pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
+   - name: Test basic
+     run: |
+       pip uninstall -y nncf
+       pytest tests/openvino/test_modeling_basic.py
    - name: Test openvino-nightly
      run: |
        pip uninstall -y openvino
13 changes: 10 additions & 3 deletions README.md
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
optimum-cli export openvino --model gpt2 ov_model
```

-You can also apply 8-bit weight-only quantization when exporting your model: the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model: the model linear, embedding and convolution weights will be quantized to INT8, the activations will be kept in floating point precision.

```plain
optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
```
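
For comparison, the same 8-bit weight-only compression can be requested when loading a model through the Python API — a minimal sketch, assuming the model class supports the `load_in_8bit` flag:

```python
from optimum.intel import OVModelForCausalLM

# Linear, embedding and convolution weights are compressed to INT8 at export time;
# activations stay in floating point.
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)
model.save_pretrained("ov_model")
```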

+Quantization in hybrid mode can be applied to a Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization to the rest of the pipeline components. In hybrid mode, weights in MatMul and Embedding layers are quantized, as well as activations of other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
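
For reference, the hybrid mode can also be reached from Python — a sketch under the assumption that `OVWeightQuantizationConfig` accepts a `dataset` argument, which triggers hybrid quantization for diffusion pipelines:

```python
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

# Passing a dataset switches the UNet to hybrid post-training quantization;
# the remaining pipeline components get weight-only INT8 compression.
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions")
pipe = OVStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", export=True, quantization_config=quantization_config
)
pipe.save_pretrained("ov_model")
```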

To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).

#### Inference:
@@ -122,7 +128,7 @@ Post-training static quantization introduces an additional calibration step wher

```python
from functools import partial
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
@@ -145,7 +151,8 @@ calibration_dataset = quantizer.get_calibration_dataset(
# The directory where the quantized model will be saved
save_dir = "nncf_results"
# Apply static quantization and save the resulting model in the OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
# Load the quantized model
optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
```
5 changes: 3 additions & 2 deletions docs/source/optimization_ov.mdx
@@ -84,7 +84,7 @@ Here is how to apply static quantization on a fine-tuned DistilBERT given your o

```python
from transformers import AutoTokenizer
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification,
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
@@ -95,7 +95,8 @@ save_dir = "ptq_model"
quantizer = OVQuantizer.from_pretrained(model)

# Apply static quantization and export the resulting quantized model to OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
# Save the tokenizer
tokenizer.save_pretrained(save_dir)
```
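
Once saved, the quantized model loads like any other OpenVINO model — a short usage sketch:

```python
from transformers import AutoTokenizer, pipeline
from optimum.intel import OVModelForSequenceClassification

# Load the statically quantized model and its tokenizer from save_dir.
model = OVModelForSequenceClassification.from_pretrained("ptq_model")
tokenizer = AutoTokenizer.from_pretrained("ptq_model")
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("He's a dreadful magician."))
```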
41 changes: 21 additions & 20 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -64,8 +64,7 @@


if is_intel_extension_for_transformers_available():
-    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
+    from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig

os.environ["CUDA_VISIBLE_DEVICES"] = ""

@@ -227,8 +226,9 @@ class OptimizationArguments:
metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
)
quantization_methodology: str = field(
default="RTN",
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
choices=["rtn", "gptq"],
default="rtn",
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
)
damp_percent: float = field(
default=0.01,
@@ -662,22 +662,23 @@ def compute_metrics(eval_preds):
                raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
            if optim_args.apply_pruning or optim_args.apply_distillation:
                raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
-           if optim_args.quantization_methodology == "GPTQ":
-               algorithm_args = {
-                   "act_order": False,
-                   "percdamp": optim_args.damp_percent,
-                   "block_size": optim_args.gptq_block_size,
-                   "nsamples": optim_args.num_calibration_samples,
-                   "use_max_length": optim_args.use_max_length,
-                   "pad_max_length": optim_args.pad_max_length,
-               }
-           quantization_config = WeightOnlyQuantConfig(
-               weight_dtype=optim_args.weight_dtype,
-               group_size=optim_args.group_size,
-               scheme=optim_args.weight_only_scheme,
-               algorithm=optim_args.quantization_methodology,
-               algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
-           )
+
+           algorithm_args = {
+               "weight_dtype": optim_args.weight_dtype,
+               "sym": optim_args.weight_only_scheme == "sym",
+               "group_size": optim_args.group_size,
+           }
+
+           if optim_args.quantization_methodology == "gptq":
+               quantization_config = GPTQConfig(
+                   damp_percent=optim_args.damp_percent,
+                   nsamples=optim_args.num_calibration_samples,
+                   blocksize=optim_args.gptq_block_size,
+                   **algorithm_args,
+               )
+           else:
+               quantization_config = RtnConfig(**algorithm_args)
+
        else:
            quantization_config = PostTrainingQuantConfig(
                approach=optim_args.quantization_approach, recipes=recipes
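
Taken on its own, the new selection logic amounts to a small factory over the two config classes — a sketch, assuming an intel-extension-for-transformers version that exposes `RtnConfig` and `GPTQConfig` with the keyword arguments used above (`build_woq_config` and the example values are hypothetical):

```python
from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig


def build_woq_config(methodology, weight_dtype, scheme, group_size):
    # Keyword arguments shared by both weight-only quantization methods.
    common = {"weight_dtype": weight_dtype, "sym": scheme == "sym", "group_size": group_size}
    if methodology == "gptq":
        # GPTQ additionally takes calibration-related settings.
        return GPTQConfig(damp_percent=0.01, nsamples=128, blocksize=128, **common)
    return RtnConfig(**common)


quantization_config = build_woq_config("rtn", "int8", "sym", 32)
```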
13 changes: 7 additions & 6 deletions notebooks/openvino/question_answering_quantization.ipynb
@@ -51,7 +51,7 @@
"import transformers\n",
"from evaluate import evaluator\n",
"from openvino.runtime import Core\n",
"from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer\n",
"from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n",
"from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n",
"\n",
"transformers.logging.set_verbosity_error()\n",
@@ -286,11 +286,11 @@
"**NOTE:** if you notice very low accuracy after post-training quantization, it is likely caused by an overflow issue which affects processors that do not contain VNNI (Vector Neural Network Instruction). NNCF has an `overflow_fix` option to address this. It will effectively use 7-bits for quantizing instead of 8-bits to prevent the overflow. To use this option, modify the code in the next cell to add an explicit quantization configuration, and set `overflow_fix` to `\"enable\"`:\n",
"\n",
"```\n",
"from optimum.intel.openvino import OVConfig\n",
"from optimum.intel.openvino import OVConfig, OVQuantizationConfig\n",
"\n",
"ov_config = OVConfig()\n",
"ov_config.compression[\"overflow_fix\"] = \"enable\"\n",
"quantizer = OVQuantizer.from_pretrained(model, ov_config=ov_config)\n",
"ov_config = OVConfig(quantization_config=OVQuantizationConfig(overflow_fix=\"enable\")\n",
"quantizer = OVQuantizer.from_pretrained(model)\n",
"quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path, ov_config=ov_config)\n",
"```\n",
"\n",
"For more information, see [Lower Numerical Precision Deep Learning Inference and Training](https://www.intel.com/content/www/us/en/developer/articles/technical/lower-numerical-precision-deep-learning-inference-and-training.html)"
@@ -317,7 +317,8 @@
"\n",
"# Quantize the model\n",
"quantizer = OVQuantizer.from_pretrained(model)\n",
"quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path)"
"ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n",
"quantizer.quantize(calibration_dataset=train_dataset, ov_config=ov_config, save_directory=int8_ptq_model_path)"
]
},
{
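
After quantization, the notebook runs inference with the INT8 model; a condensed sketch of that pattern (the model path is a placeholder):

```python
from transformers import AutoTokenizer, pipeline
from optimum.intel.openvino import OVModelForQuestionAnswering

# Load the statically quantized model saved by quantizer.quantize(...) above.
int8_model = OVModelForQuestionAnswering.from_pretrained("int8_ptq_model")  # placeholder path
tokenizer = AutoTokenizer.from_pretrained("int8_ptq_model")

qa_pipeline = pipeline("question-answering", model=int8_model, tokenizer=tokenizer)
result = qa_pipeline(
    question="What prevents the overflow issue?",
    context="Processors supporting VNNI avoid the INT8 overflow issue.",
)
print(result["answer"])
```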
96 changes: 80 additions & 16 deletions optimum/commands/export/openvino.py
@@ -19,6 +19,7 @@
from typing import TYPE_CHECKING, Optional

from ...exporters import TasksManager
+from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
from ..base import BaseOptimumCLICommand, CommandInfo


@@ -104,6 +105,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
        default=None,
        help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
    )
+   optional_group.add_argument(
+       "--dataset",
+       type=str,
+       default=None,
+       help=(
+           "The dataset used for data-aware compression or quantization with NNCF. "
+           "You can use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLMs "
+           "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
+       ),
+   )
optional_group.add_argument(
"--disable-stateful",
action="store_true",
@@ -115,10 +126,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
"OpenVINO native inference code that expects kv-cache inputs and outputs in the model."
),
)
optional_group.add_argument(
"--disable-convert-tokenizer",
action="store_true",
help="Do not add converted tokenizer and detokenizer OpenVINO models.",
)
optional_group.add_argument(
"--convert-tokenizer",
action="store_true",
help="Add converted tokenizer and detokenizer with OpenVINO Tokenizers",
help="[Deprecated] Add converted tokenizer and detokenizer with OpenVINO Tokenizers.",
)

optional_group.add_argument(
@@ -195,20 +211,68 @@ def run(self):
                )
            quantization_config["sym"] = "asym" not in self.args.weight_format
            quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
+           quantization_config["dataset"] = self.args.dataset
            ov_config = OVConfig(quantization_config=quantization_config)

-       # TODO : add input shapes
-       main_export(
-           model_name_or_path=self.args.model,
-           output=self.args.output,
-           task=self.args.task,
-           framework=self.args.framework,
-           cache_dir=self.args.cache_dir,
-           trust_remote_code=self.args.trust_remote_code,
-           pad_token_id=self.args.pad_token_id,
-           ov_config=ov_config,
-           stateful=not self.args.disable_stateful,
-           convert_tokenizer=self.args.convert_tokenizer,
-           library_name=self.args.library
-           # **input_shapes,
-       )
+       library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library)
+       if library_name == "sentence_transformers" and self.args.library is None:
+           logger.warning(
+               "Library name is not specified. There are multiple possible variants: `sentence_transformers`, `transformers`. "
+               "`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers"
+           )
+           library_name = "transformers"
+
+       if (
+           library_name == "diffusers"
+           and ov_config
+           and ov_config.quantization_config
+           and ov_config.quantization_config.dataset is not None
+       ):
+           if not is_diffusers_available():
+               raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models"))
+
+           from diffusers import DiffusionPipeline
+
+           diffusers_config = DiffusionPipeline.load_config(self.args.model)
+           class_name = diffusers_config.get("_class_name", None)
+
+           if class_name == "LatentConsistencyModelPipeline":
+               from optimum.intel import OVLatentConsistencyModelPipeline
+
+               model_cls = OVLatentConsistencyModelPipeline
+
+           elif class_name == "StableDiffusionXLPipeline":
+               from optimum.intel import OVStableDiffusionXLPipeline
+
+               model_cls = OVStableDiffusionXLPipeline
+           elif class_name == "StableDiffusionPipeline":
+               from optimum.intel import OVStableDiffusionPipeline
+
+               model_cls = OVStableDiffusionPipeline
+           else:
+               raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
+
+           model = model_cls.from_pretrained(
+               self.args.model, export=True, quantization_config=ov_config.quantization_config
+           )
+           model.save_pretrained(self.args.output)
+
+       else:
+           if self.args.convert_tokenizer:
+               logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")
+
+           # TODO : add input shapes
+           main_export(
+               model_name_or_path=self.args.model,
+               output=self.args.output,
+               task=self.args.task,
+               framework=self.args.framework,
+               cache_dir=self.args.cache_dir,
+               trust_remote_code=self.args.trust_remote_code,
+               pad_token_id=self.args.pad_token_id,
+               ov_config=ov_config,
+               stateful=not self.args.disable_stateful,
+               convert_tokenizer=not self.args.disable_convert_tokenizer,
+               library_name=library_name,
+               # **input_shapes,
+           )
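
The diffusers branch of `run()` above boils down to a class-name dispatch; a condensed sketch of that logic as a standalone helper (`export_hybrid` is hypothetical; the mapping mirrors the code in this diff):

```python
from diffusers import DiffusionPipeline

from optimum.intel import (
    OVLatentConsistencyModelPipeline,
    OVStableDiffusionPipeline,
    OVStableDiffusionXLPipeline,
)

# Pipelines for which hybrid (weight + UNet activation) quantization is wired up.
_HYBRID_PIPELINES = {
    "LatentConsistencyModelPipeline": OVLatentConsistencyModelPipeline,
    "StableDiffusionXLPipeline": OVStableDiffusionXLPipeline,
    "StableDiffusionPipeline": OVStableDiffusionPipeline,
}


def export_hybrid(model_id, output_dir, quantization_config):
    class_name = DiffusionPipeline.load_config(model_id).get("_class_name", None)
    model_cls = _HYBRID_PIPELINES.get(class_name)
    if model_cls is None:
        raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
    model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
    model.save_pretrained(output_dir)
```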
17 changes: 6 additions & 11 deletions optimum/exporters/openvino/__main__.py
@@ -22,11 +22,10 @@
from optimum.exporters import TasksManager
from optimum.exporters.onnx.base import OnnxConfig
from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
+from optimum.exporters.openvino.convert import export_from_model, export_tokenizer
+from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
from optimum.utils.save_utils import maybe_load_preprocessors

-from ...intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
-from .convert import export_from_model, export_tokenizer


if TYPE_CHECKING:
from optimum.intel.openvino.configuration import OVConfig
@@ -77,7 +76,7 @@ def main_export(
model_name_or_path (`str`):
Model ID on huggingface.co or path on disk to the model repository to export.
output (`Union[str, Path]`):
-        Path indicating the directory where to store the generated ONNX model.
+        Path indicating the directory where to store the generated OpenVINO model.
> Optional parameters
@@ -187,12 +186,6 @@
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)

-    if convert_tokenizer and not is_openvino_tokenizers_available():
-        logger.warning(
-            "`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`"
-        )
-        convert_tokenizer = False

do_gptq_patching = False
custom_architecture = False
loading_kwargs = {}
@@ -348,7 +341,7 @@ class StoreAttr(object):
**kwargs_shapes,
)

-    if convert_tokenizer:
+    if convert_tokenizer and is_openvino_tokenizers_available():
if library_name != "diffusers":
tokenizer = next(
(preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)),
@@ -371,6 +364,8 @@
tokenizer_2 = getattr(model, "tokenizer_2", None)
if tokenizer_2 is not None:
export_tokenizer(tokenizer_2, output, suffix="_2")
+    elif convert_tokenizer and not is_openvino_tokenizers_available():
+        logger.warning("Tokenizer won't be converted.")

# Unpatch modules after GPTQ export
if do_gptq_patching:
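
With these changes, callers of `main_export` no longer need to guard tokenizer conversion themselves — a minimal usage sketch, assuming the defaults shown in this diff:

```python
from optimum.exporters.openvino import main_export

# Tokenizer conversion is attempted whenever convert_tokenizer is set;
# if openvino-tokenizers is missing, a warning is logged instead of failing.
main_export(
    model_name_or_path="gpt2",
    output="ov_model",
    task="text-generation-with-past",
    convert_tokenizer=True,
)
```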
[The remaining 23 changed files are collapsed and not shown in this view.]
