Merge branch 'main' into latent-consistency
echarlaix committed Nov 6, 2023
2 parents 7b7e773 + bf8e95c commit e7e3922
Showing 25 changed files with 979 additions and 500 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test_inc.yml
@@ -30,7 +30,8 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[neural-compressor,ipex,diffusers,tests]
          pip install .[neural-compressor,diffusers,tests]
          pip install intel-extension-for-pytorch
      - name: Test with Pytest
        run: |
          pytest tests/neural_compressor/
6 changes: 6 additions & 0 deletions .github/workflows/test_openvino.yml
@@ -38,3 +38,9 @@ jobs:
      - name: Test with Pytest
        run: |
          pytest tests/openvino/ --ignore test_modeling_basic
      - name: Test openvino-nightly import
        run: |
          pip uninstall -y openvino
          pip install openvino-nightly
          python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
43 changes: 34 additions & 9 deletions README.md
@@ -67,26 +67,51 @@ For more details on the supported compression techniques, please refer to the [d

Below are the examples of how to use OpenVINO and its [NNCF](https://docs.openvino.ai/latest/tmo_introduction.html) framework to accelerate inference.

#### Export:

It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2023.1/openvino_ir.html) IR format with the CLI:

```plain
optimum-cli export openvino --model gpt2 ov_model
```

If you add `--int8`, the weights will be quantized to INT8 while the activations will be kept in floating point precision.

```plain
optimum-cli export openvino --model gpt2 --int8 ov_model
```


#### Inference:

To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class.


```diff
- from transformers import AutoModelForSeq2SeqLM
+ from optimum.intel import OVModelForSeq2SeqLM
from transformers import AutoTokenizer, pipeline

model_id = "echarlaix/t5-small-openvino"
- model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
+ model = OVModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer)
results = pipe("He never went out without a book under his arm, and he often came back with two.")

[{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}]
```

If you want to load a PyTorch checkpoint, set `export=True` to convert your model to the OpenVINO IR.

```python
from optimum.intel import OVModelForCausalLM

model = OVModelForCausalLM.from_pretrained("gpt2", export=True)
model.save_pretrained("./ov_model")
```


#### Post-training static quantization:

Post-training static quantization introduces an additional calibration step where data is fed through the network in order to compute the quantization parameters of the activations. Here is an example of how to apply static quantization to a fine-tuned DistilBERT.
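The full example is collapsed in this diff; the sketch below illustrates the typical flow with `OVQuantizer`, where the calibration dataset (`glue`/`sst2`), the preprocessing function and the number of samples are illustrative choices rather than requirements.

```python
from functools import partial

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.intel import OVQuantizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

def preprocess_fn(examples, tokenizer):
    return tokenizer(examples["sentence"], padding=True, truncation=True, max_length=128)

quantizer = OVQuantizer.from_pretrained(model)
# The calibration dataset is used to estimate the quantization parameters of the activations
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
    num_samples=100,
    dataset_split="train",
)
# Apply static quantization and save the resulting OpenVINO IR
quantizer.quantize(calibration_dataset=calibration_dataset, save_directory="ov_quantized_model")
```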
196 changes: 127 additions & 69 deletions docs/source/inference.mdx

Large diffs are not rendered by default.

23 changes: 22 additions & 1 deletion docs/source/optimization_ov.mdx
@@ -62,6 +62,27 @@ tokenizer.save_pretrained(save_dir)

The `quantize()` method applies post-training static quantization and exports the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file containing the weights. The quantized model can be run on any target Intel device.
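To run inference on the exported IR, the quantized model can be reloaded with the matching `OVModelForXxx` class. A minimal sketch, assuming the sequence classification example above and its `save_dir`:

```python
from optimum.intel import OVModelForSequenceClassification

# Loads the quantized IR (XML + BIN files) produced by quantize()
optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
```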

### Weights compression

For large language models (LLMs), it is often beneficial to quantize only the weights and keep the activations in floating point precision. This method does not require a calibration dataset. To enable weights compression, set `weights_only=True` when calling the `quantize()` method of `OVQuantizer`:

```python
from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM
from transformers import AutoModelForCausalLM

save_dir = "int8_weights_compressed_model"
model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b")
quantizer = OVQuantizer.from_pretrained(model, task="text-generation")
quantizer.quantize(save_directory=save_dir, weights_only=True)
```

To load the optimized model for inference:

```python
optimized_model = OVModelForCausalLM.from_pretrained(save_dir)
```
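The compressed model behaves like any other `OVModelForCausalLM` and can, for instance, be plugged into a `transformers` pipeline. A short sketch, where the prompt and generation length are arbitrary:

```python
from transformers import AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b")
pipe = pipeline("text-generation", model=optimized_model, tokenizer=tokenizer)
print(pipe("Explain weight-only quantization in one sentence:", max_new_tokens=50))
```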

Weights compression is supported for both PyTorch and OpenVINO models: the starting model can be either an `AutoModelForCausalLM` or an `OVModelForCausalLM` instance.
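For instance, starting from a model that was already exported to the OpenVINO format, the flow would look roughly as follows (a sketch reusing the same checkpoint as above):

```python
from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM

# Export the model to the OpenVINO format (or load an existing OpenVINO model)
model = OVModelForCausalLM.from_pretrained("databricks/dolly-v2-3b", export=True)
quantizer = OVQuantizer.from_pretrained(model, task="text-generation")
quantizer.quantize(save_directory="int8_weights_compressed_ov_model", weights_only=True)
```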

## Training-time optimization

@@ -221,4 +242,4 @@ text = "He's a dreadful magician."
outputs = cls_pipe(text)

[{'label': 'NEGATIVE', 'score': 0.9840195178985596}]
```
```
71 changes: 52 additions & 19 deletions optimum/exporters/openvino/__main__.py
@@ -18,7 +18,7 @@
from typing import Any, Callable, Dict, Optional, Union

from requests.exceptions import ConnectionError as RequestsConnectionError
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoTokenizer

from optimum.exporters import TasksManager
from optimum.exporters.onnx import __main__ as optimum_main
@@ -27,7 +27,6 @@
from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors

from ...intel.utils.import_utils import is_nncf_available
from ...intel.utils.modeling_utils import patch_decoder_attention_mask
from .convert import export_models


@@ -137,6 +136,41 @@ def main_export(
    original_task = task
    task = TasksManager.map_from_synonym(task)

    # Patch the modules to allow exporting GPTQ models without a GPU
    do_gptq_patching = False
    try:
        config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
        config_dict = config.to_dict()
        quantization_config = config_dict.get("quantization_config", None)
        do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
    except Exception:
        pass

    if do_gptq_patching:
        import torch

        torch.set_default_dtype(torch.float32)
        # Pretend CUDA is available so that the GPTQ checkpoint can be loaded for export on CPU
        orig_cuda_check = torch.cuda.is_available
        torch.cuda.is_available = lambda: True

        from optimum.gptq import GPTQQuantizer

        orig_post_init_model = GPTQQuantizer.post_init_model

        def post_init_model(self, model):
            from auto_gptq import exllama_set_max_input_length

            class StoreAttr(object):
                pass

            model.quantize_config = StoreAttr()
            model.quantize_config.desc_act = self.desc_act
            if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
                model = exllama_set_max_input_length(model, self.max_input_length)
            return model

        GPTQQuantizer.post_init_model = post_init_model
    framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework)

    # get the shapes to be used to generate dummy inputs
@@ -222,24 +256,18 @@ def main_export(
    preprocessors = maybe_load_preprocessors(
        model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
    )
    if not task.startswith("text-generation"):
        onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs(
            model=model,
            task=task,
            monolith=False,
            custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {},
            custom_architecture=custom_architecture,
            fn_get_submodels=fn_get_submodels,
            preprocessors=preprocessors,
            _variant="default",
        )
    else:
        # TODO : ModelPatcher will be added in next optimum release
        model = patch_decoder_attention_mask(model)

        onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task)
        onnx_config = onnx_config_constructor(model.config)
        models_and_onnx_configs = {"model": (model, onnx_config)}
    onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs(
        model=model,
        task=task,
        monolith=False,
        custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {},
        custom_architecture=custom_architecture,
        fn_get_submodels=fn_get_submodels,
        preprocessors=preprocessors,
        _variant="default",
        legacy=False,
    )

    if int8 is None:
        int8 = False
@@ -324,3 +352,8 @@ def main_export(
        int8=int8,
        model_kwargs=model_kwargs,
    )

    # Unpatch modules after GPTQ export
    if do_gptq_patching:
        torch.cuda.is_available = orig_cuda_check
        GPTQQuantizer.post_init_model = orig_post_init_model
