Merge branch 'main' into nncf-210-update
nikita-savelyevv committed Apr 22, 2024
2 parents f99f767 + 673b88b commit 2b79a0d
Showing 31 changed files with 843 additions and 433 deletions.
14 changes: 10 additions & 4 deletions .github/workflows/test_inc.yml
@@ -32,11 +32,17 @@ jobs:
        python -m pip install --upgrade pip
        pip install cmake
        pip install py-cpuinfo
-       pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
        pip install .[neural-compressor,diffusers,tests]
-       pip install intel-extension-for-pytorch==2.1.100
-       pip install intel-extension-for-transformers==1.3.2
+       pip install intel-extension-for-transformers
        pip install peft
    - name: Test with Pytest
      run: |
-       pytest tests/neural_compressor/
+       pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
+   - name: Test IPEX
+     run: |
+       pip uninstall -y intel-extension-for-transformers
+       pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
+       pip install intel-extension-for-pytorch==2.1.100
+       pytest tests/neural_compressor/test_ipex.py
6 changes: 5 additions & 1 deletion .github/workflows/test_openvino.yml
@@ -35,7 +35,11 @@ jobs:
        pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
    - name: Test with Pytest
      run: |
-       pytest tests/openvino/ --ignore test_modeling_basic --durations=0
+       pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
+   - name: Test basic
+     run: |
+       pip uninstall -y nncf
+       pytest tests/openvino/test_modeling_basic.py
    - name: Test openvino-nightly
      run: |
        pip uninstall -y openvino
13 changes: 10 additions & 3 deletions README.md
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
optimum-cli export openvino --model gpt2 ov_model
```

-You can also apply 8-bit weight-only quantization when exporting your model: the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model: the model linear, embedding and convolution weights will be quantized to INT8, the activations will be kept in floating point precision.

```plain
optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
```
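
For comparison, the same 8-bit weight-only compression can be requested when loading a model through the Python API — a minimal sketch, assuming the model class supports the `load_in_8bit` flag:

```python
from optimum.intel import OVModelForCausalLM

# Linear, embedding and convolution weights are compressed to INT8 at export time;
# activations stay in floating point.
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)
model.save_pretrained("ov_model")
```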

+Quantization in hybrid mode can be applied to a Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization to the rest of the pipeline components. In hybrid mode, weights in MatMul and Embedding layers are quantized, as well as activations of other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
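
For reference, the hybrid mode can also be reached from Python — a sketch under the assumption that `OVWeightQuantizationConfig` accepts a `dataset` argument, which triggers hybrid quantization for diffusion pipelines:

```python
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

# Passing a dataset switches the UNet to hybrid post-training quantization;
# the remaining pipeline components get weight-only INT8 compression.
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions")
pipe = OVStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", export=True, quantization_config=quantization_config
)
pipe.save_pretrained("ov_model")
```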

To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).

#### Inference:
@@ -122,7 +128,7 @@ Post-training static quantization introduces an additional calibration step wher

```python
from functools import partial
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
@@ -145,7 +151,8 @@ calibration_dataset = quantizer.get_calibration_dataset(
# The directory where the quantized model will be saved
save_dir = "nncf_results"
# Apply static quantization and save the resulting model in the OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
# Load the quantized model
optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
```
5 changes: 3 additions & 2 deletions docs/source/optimization_ov.mdx
@@ -84,7 +84,7 @@ Here is how to apply static quantization on a fine-tuned DistilBERT given your o

```python
from transformers import AutoTokenizer
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification,
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
@@ -95,7 +95,8 @@ save_dir = "ptq_model"
quantizer = OVQuantizer.from_pretrained(model)

# Apply static quantization and export the resulting quantized model to OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
# Save the tokenizer
tokenizer.save_pretrained(save_dir)
```
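
Once saved, the quantized model loads like any other OpenVINO model — a short usage sketch:

```python
from transformers import AutoTokenizer, pipeline
from optimum.intel import OVModelForSequenceClassification

# Load the statically quantized model and its tokenizer from save_dir.
model = OVModelForSequenceClassification.from_pretrained("ptq_model")
tokenizer = AutoTokenizer.from_pretrained("ptq_model")
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("He's a dreadful magician."))
```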
41 changes: 21 additions & 20 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -64,8 +64,7 @@


if is_intel_extension_for_transformers_available():
-    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
+    from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig

os.environ["CUDA_VISIBLE_DEVICES"] = ""

@@ -227,8 +226,9 @@ class OptimizationArguments:
metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
)
quantization_methodology: str = field(
default="RTN",
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
choices=["rtn", "gptq"],
default="rtn",
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
)
damp_percent: float = field(
default=0.01,
@@ -662,22 +662,23 @@ def compute_metrics(eval_preds):
                raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
            if optim_args.apply_pruning or optim_args.apply_distillation:
                raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
-           if optim_args.quantization_methodology == "GPTQ":
-               algorithm_args = {
-                   "act_order": False,
-                   "percdamp": optim_args.damp_percent,
-                   "block_size": optim_args.gptq_block_size,
-                   "nsamples": optim_args.num_calibration_samples,
-                   "use_max_length": optim_args.use_max_length,
-                   "pad_max_length": optim_args.pad_max_length,
-               }
-           quantization_config = WeightOnlyQuantConfig(
-               weight_dtype=optim_args.weight_dtype,
-               group_size=optim_args.group_size,
-               scheme=optim_args.weight_only_scheme,
-               algorithm=optim_args.quantization_methodology,
-               algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
-           )
+
+           algorithm_args = {
+               "weight_dtype": optim_args.weight_dtype,
+               "sym": optim_args.weight_only_scheme == "sym",
+               "group_size": optim_args.group_size,
+           }
+
+           if optim_args.quantization_methodology == "gptq":
+               quantization_config = GPTQConfig(
+                   damp_percent=optim_args.damp_percent,
+                   nsamples=optim_args.num_calibration_samples,
+                   blocksize=optim_args.gptq_block_size,
+                   **algorithm_args,
+               )
+           else:
+               quantization_config = RtnConfig(**algorithm_args)
+
        else:
            quantization_config = PostTrainingQuantConfig(
                approach=optim_args.quantization_approach, recipes=recipes
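
Taken on its own, the new selection logic amounts to a small factory over the two config classes — a sketch, assuming an intel-extension-for-transformers version that exposes `RtnConfig` and `GPTQConfig` with the keyword arguments used above (`build_woq_config` and the example values are hypothetical):

```python
from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig


def build_woq_config(methodology, weight_dtype, scheme, group_size):
    # Keyword arguments shared by both weight-only quantization methods.
    common = {"weight_dtype": weight_dtype, "sym": scheme == "sym", "group_size": group_size}
    if methodology == "gptq":
        # GPTQ additionally takes calibration-related settings.
        return GPTQConfig(damp_percent=0.01, nsamples=128, blocksize=128, **common)
    return RtnConfig(**common)


quantization_config = build_woq_config("rtn", "int8", "sym", 32)
```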
13 changes: 7 additions & 6 deletions notebooks/openvino/question_answering_quantization.ipynb
@@ -51,7 +51,7 @@
"import transformers\n",
"from evaluate import evaluator\n",
"from openvino.runtime import Core\n",
"from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer\n",
"from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n",
"from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n",
"\n",
"transformers.logging.set_verbosity_error()\n",
@@ -286,11 +286,11 @@
"**NOTE:** if you notice very low accuracy after post-training quantization, it is likely caused by an overflow issue which affects processors that do not contain VNNI (Vector Neural Network Instruction). NNCF has an `overflow_fix` option to address this. It will effectively use 7-bits for quantizing instead of 8-bits to prevent the overflow. To use this option, modify the code in the next cell to add an explicit quantization configuration, and set `overflow_fix` to `\"enable\"`:\n",
"\n",
"```\n",
"from optimum.intel.openvino import OVConfig\n",
"from optimum.intel.openvino import OVConfig, OVQuantizationConfig\n",
"\n",
"ov_config = OVConfig()\n",
"ov_config.compression[\"overflow_fix\"] = \"enable\"\n",
"quantizer = OVQuantizer.from_pretrained(model, ov_config=ov_config)\n",
"ov_config = OVConfig(quantization_config=OVQuantizationConfig(overflow_fix=\"enable\")\n",
"quantizer = OVQuantizer.from_pretrained(model)\n",
"quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path, ov_config=ov_config)\n",
"```\n",
"\n",
"For more information, see [Lower Numerical Precision Deep Learning Inference and Training](https://www.intel.com/content/www/us/en/developer/articles/technical/lower-numerical-precision-deep-learning-inference-and-training.html)"
@@ -317,7 +317,8 @@
"\n",
"# Quantize the model\n",
"quantizer = OVQuantizer.from_pretrained(model)\n",
"quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path)"
"ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n",
"quantizer.quantize(calibration_dataset=train_dataset, ov_config=ov_config, save_directory=int8_ptq_model_path)"
]
},
{
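
After quantization, the notebook runs inference with the INT8 model; a condensed sketch of that pattern (the model path is a placeholder):

```python
from transformers import AutoTokenizer, pipeline
from optimum.intel.openvino import OVModelForQuestionAnswering

# Load the statically quantized model saved by quantizer.quantize(...) above.
int8_model = OVModelForQuestionAnswering.from_pretrained("int8_ptq_model")  # placeholder path
tokenizer = AutoTokenizer.from_pretrained("int8_ptq_model")

qa_pipeline = pipeline("question-answering", model=int8_model, tokenizer=tokenizer)
result = qa_pipeline(
    question="What prevents the overflow issue?",
    context="Processors supporting VNNI avoid the INT8 overflow issue.",
)
print(result["answer"])
```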
96 changes: 80 additions & 16 deletions optimum/commands/export/openvino.py
@@ -19,6 +19,7 @@
from typing import TYPE_CHECKING, Optional

from ...exporters import TasksManager
+from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
from ..base import BaseOptimumCLICommand, CommandInfo


@@ -104,6 +105,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
        default=None,
        help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
    )
+   optional_group.add_argument(
+       "--dataset",
+       type=str,
+       default=None,
+       help=(
+           "The dataset used for data-aware compression or quantization with NNCF. "
+           "You can use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLMs "
+           "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
+       ),
+   )
optional_group.add_argument(
"--disable-stateful",
action="store_true",
@@ -115,10 +126,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
"OpenVINO native inference code that expects kv-cache inputs and outputs in the model."
),
)
optional_group.add_argument(
"--disable-convert-tokenizer",
action="store_true",
help="Do not add converted tokenizer and detokenizer OpenVINO models.",
)
optional_group.add_argument(
"--convert-tokenizer",
action="store_true",
help="Add converted tokenizer and detokenizer with OpenVINO Tokenizers",
help="[Deprecated] Add converted tokenizer and detokenizer with OpenVINO Tokenizers.",
)

optional_group.add_argument(
@@ -195,20 +211,68 @@ def run(self):
                )
            quantization_config["sym"] = "asym" not in self.args.weight_format
            quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
+           quantization_config["dataset"] = self.args.dataset
            ov_config = OVConfig(quantization_config=quantization_config)

-       # TODO : add input shapes
-       main_export(
-           model_name_or_path=self.args.model,
-           output=self.args.output,
-           task=self.args.task,
-           framework=self.args.framework,
-           cache_dir=self.args.cache_dir,
-           trust_remote_code=self.args.trust_remote_code,
-           pad_token_id=self.args.pad_token_id,
-           ov_config=ov_config,
-           stateful=not self.args.disable_stateful,
-           convert_tokenizer=self.args.convert_tokenizer,
-           library_name=self.args.library
-           # **input_shapes,
-       )
+       library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library)
+       if library_name == "sentence_transformers" and self.args.library is None:
+           logger.warning(
+               "Library name is not specified. There are multiple possible variants: `sentence_transformers`, `transformers`. "
+               "`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers"
+           )
+           library_name = "transformers"
+
+       if (
+           library_name == "diffusers"
+           and ov_config
+           and ov_config.quantization_config
+           and ov_config.quantization_config.dataset is not None
+       ):
+           if not is_diffusers_available():
+               raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models"))
+
+           from diffusers import DiffusionPipeline
+
+           diffusers_config = DiffusionPipeline.load_config(self.args.model)
+           class_name = diffusers_config.get("_class_name", None)
+
+           if class_name == "LatentConsistencyModelPipeline":
+               from optimum.intel import OVLatentConsistencyModelPipeline
+
+               model_cls = OVLatentConsistencyModelPipeline
+
+           elif class_name == "StableDiffusionXLPipeline":
+               from optimum.intel import OVStableDiffusionXLPipeline
+
+               model_cls = OVStableDiffusionXLPipeline
+           elif class_name == "StableDiffusionPipeline":
+               from optimum.intel import OVStableDiffusionPipeline
+
+               model_cls = OVStableDiffusionPipeline
+           else:
+               raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
+
+           model = model_cls.from_pretrained(
+               self.args.model, export=True, quantization_config=ov_config.quantization_config
+           )
+           model.save_pretrained(self.args.output)
+
+       else:
+           if self.args.convert_tokenizer:
+               logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")
+
+           # TODO : add input shapes
+           main_export(
+               model_name_or_path=self.args.model,
+               output=self.args.output,
+               task=self.args.task,
+               framework=self.args.framework,
+               cache_dir=self.args.cache_dir,
+               trust_remote_code=self.args.trust_remote_code,
+               pad_token_id=self.args.pad_token_id,
+               ov_config=ov_config,
+               stateful=not self.args.disable_stateful,
+               convert_tokenizer=not self.args.disable_convert_tokenizer,
+               library_name=library_name,
+               # **input_shapes,
+           )
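
The diffusers branch of `run()` above boils down to a class-name dispatch; a condensed sketch of that logic as a standalone helper (`export_hybrid` is hypothetical; the mapping mirrors the code in this diff):

```python
from diffusers import DiffusionPipeline

from optimum.intel import (
    OVLatentConsistencyModelPipeline,
    OVStableDiffusionPipeline,
    OVStableDiffusionXLPipeline,
)

# Pipelines for which hybrid (weight + UNet activation) quantization is wired up.
_HYBRID_PIPELINES = {
    "LatentConsistencyModelPipeline": OVLatentConsistencyModelPipeline,
    "StableDiffusionXLPipeline": OVStableDiffusionXLPipeline,
    "StableDiffusionPipeline": OVStableDiffusionPipeline,
}


def export_hybrid(model_id, output_dir, quantization_config):
    class_name = DiffusionPipeline.load_config(model_id).get("_class_name", None)
    model_cls = _HYBRID_PIPELINES.get(class_name)
    if model_cls is None:
        raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
    model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
    model.save_pretrained(output_dir)
```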
17 changes: 6 additions & 11 deletions optimum/exporters/openvino/__main__.py
@@ -22,11 +22,10 @@
from optimum.exporters import TasksManager
from optimum.exporters.onnx.base import OnnxConfig
from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
+from optimum.exporters.openvino.convert import export_from_model, export_tokenizer
+from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
from optimum.utils.save_utils import maybe_load_preprocessors

-from ...intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
-from .convert import export_from_model, export_tokenizer


if TYPE_CHECKING:
from optimum.intel.openvino.configuration import OVConfig
@@ -77,7 +76,7 @@ def main_export(
model_name_or_path (`str`):
Model ID on huggingface.co or path on disk to the model repository to export.
output (`Union[str, Path]`):
-        Path indicating the directory where to store the generated ONNX model.
+        Path indicating the directory where to store the generated OpenVINO model.
> Optional parameters
@@ -187,12 +186,6 @@
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)

-    if convert_tokenizer and not is_openvino_tokenizers_available():
-        logger.warning(
-            "`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`"
-        )
-        convert_tokenizer = False

do_gptq_patching = False
custom_architecture = False
loading_kwargs = {}
@@ -348,7 +341,7 @@ class StoreAttr(object):
**kwargs_shapes,
)

-    if convert_tokenizer:
+    if convert_tokenizer and is_openvino_tokenizers_available():
if library_name != "diffusers":
tokenizer = next(
(preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)),
@@ -371,6 +364,8 @@
tokenizer_2 = getattr(model, "tokenizer_2", None)
if tokenizer_2 is not None:
export_tokenizer(tokenizer_2, output, suffix="_2")
+    elif convert_tokenizer and not is_openvino_tokenizers_available():
+        logger.warning("Tokenizer won't be converted.")

# Unpatch modules after GPTQ export
if do_gptq_patching:
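
With these changes, callers of `main_export` no longer need to guard tokenizer conversion themselves — a minimal usage sketch, assuming the defaults shown in this diff:

```python
from optimum.exporters.openvino import main_export

# Tokenizer conversion is attempted whenever convert_tokenizer is set;
# if openvino-tokenizers is missing, a warning is logged instead of failing.
main_export(
    model_name_or_path="gpt2",
    output="ov_model",
    task="text-generation-with-past",
    convert_tokenizer=True,
)
```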
[The remaining 23 changed files are collapsed and not shown in this view.]
