diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index fd5fd16509..3a15214f99 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -30,7 +30,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install .[neural-compressor,ipex,diffusers,tests] + pip install .[neural-compressor,diffusers,tests] + pip install intel-extension-for-pytorch - name: Test with Pytest run: | pytest tests/neural_compressor/ diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 20d4f3192c..cd0b6a7675 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,3 +38,9 @@ jobs: - name: Test with Pytest run: | pytest tests/openvino/ --ignore test_modeling_basic + - name: Test openvino-nightly import + run: | + pip uninstall -y openvino + pip install openvino-nightly + python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)" + diff --git a/README.md b/README.md index e06f91ef17..9f25eefd94 100644 --- a/README.md +++ b/README.md @@ -67,26 +67,51 @@ For more details on the supported compression techniques, please refer to the [d Below are the examples of how to use OpenVINO and its [NNCF](https://docs.openvino.ai/latest/tmo_introduction.html) framework to accelerate inference. +#### Export: + +It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2023.1/openvino_ir.html) IR format with the CLI : + +```plain +optimum-cli export openvino --model gpt2 ov_model +``` + +If you add `--int8`, the weights will be quantized to INT8, the activations will be kept in floating point precision. + +```plain +optimum-cli export openvino --model gpt2 --int8 ov_model +``` + + #### Inference: To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. -If you want to load a PyTorch checkpoint, set `export=True` to convert your model to the OpenVINO IR. + ```diff -- from transformers import AutoModelForSequenceClassification -+ from optimum.intel import OVModelForSequenceClassification +- from transformers import AutoModelForSeq2SeqLM ++ from optimum.intel import OVModelForSeq2SeqLM from transformers import AutoTokenizer, pipeline - model_id = "distilbert-base-uncased-finetuned-sst-2-english" -- model = AutoModelForSequenceClassification.from_pretrained(model_id) -+ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) + model_id = "echarlaix/t5-small-openvino" +- model = AutoModelForSeq2SeqLM.from_pretrained(model_id) ++ model = OVModelForSeq2SeqLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - model.save_pretrained("./distilbert") + pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + results = pipe("He never went out without a book under his arm, and he often came back with two.") - classifier = pipeline("text-classification", model=model, tokenizer=tokenizer) - results = classifier("He's a dreadful magician.") + [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] ``` +If you want to load a PyTorch checkpoint, set `export=True` to convert your model to the OpenVINO IR. + +```python +from optimum.intel import OVModelForCausalLM + +model = OVModelForCausalLM.from_pretrained("gpt2", export=True) +model.save_pretrained("./ov_model") +``` + + #### Post-training static quantization: Post-training static quantization introduces an additional calibration step where data is fed through the network in order to compute the activations quantization parameters. Here is an example on how to apply static quantization on a fine-tuned DistilBERT. diff --git a/docs/source/inference.mdx b/docs/source/inference.mdx index f0a6d2edab..bfd15bde11 100644 --- a/docs/source/inference.mdx +++ b/docs/source/inference.mdx @@ -11,34 +11,82 @@ specific language governing permissions and limitations under the License. Optimum Intel can be used to load optimized models from the [Hugging Face Hub](https://huggingface.co/models?library=openvino&sort=downloads) and create pipelines to run inference with OpenVINO Runtime without rewriting your APIs. -## Switching from Transformers to Optimum +## Transformers models You can now easily perform inference with OpenVINO Runtime on a variety of Intel processors ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices). For that, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. -To load a Transformers model and convert it to the OpenVINO format on-the-fly, you can set `export=True` when loading your model. -Here is an example on how to perform inference with OpenVINO Runtime for a text classification class: +As shown in the table below, each task is associated with a class enabling to automatically load your model. + +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-classification` | `OVModelForSequenceClassification` | +| `token-classification` | `OVModelForTokenClassification` | +| `question-answering` | `OVModelForQuestionAnswering` | +| `audio-classification` | `OVModelForAudioClassification` | +| `image-classification` | `OVModelForImageClassification` | +| `feature-extraction` | `OVModelForFeatureExtraction` | +| `fill-mask` | `OVModelForMaskedLM` | +| `text-generation` | `OVModelForCausalLM` | +| `text2text-generation` | `OVModelForSeq2SeqLM` | + + +### Export + +It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2023.1/openvino_ir.html) IR format with the CLI : + +```bash +optimum-cli export openvino --model gpt2 ov_model +``` + +The example above illustrates exporting a checkpoint from the 🤗 Hub. When exporting a local model, first make sure that you saved both the model’s weights and tokenizer files in the same directory (`local_path`). +When using CLI, pass the `local_path` to the model argument instead of the checkpoint name of the model hosted on the Hub and provide the `--task` argument. You can review the list of supported tasks in the 🤗 [Optimum documentation](https://huggingface.co/docs/optimum/exporters/task_manager). If task argument is not provided, it will default to the model architecture without any task specific head. +Here we set the `task` to `text-generation-with-past`, with the `-with-past` suffix enabling the re-use of the pre-computed key/values hidden-states `use_cache=True`. + +```bash +optimum-cli export openvino --model local_path --task text-generation-with-past ov_model +``` + +Once the model is exported, you can load the OpenVINO model using : + +```python +from optimum.intel import AutoModelForCausalLM + +model_id = "helenai/gpt2-ov" +model = AutoModelForCausalLM.from_pretrained(model_id) +``` + +You can also load your PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, by setting `export=True` when loading your model. + +```python +from optimum.intel import AutoModelForCausalLM + +model_id = "gpt2" +model = AutoModelForCausalLM.from_pretrained(model_id, export=True) +model.save_pretrained("ov_model") +``` + +### Inference + +You can load an OpenVINO hosted on the hub and perform inference, no need to adapt your code to get it to work with `OVModelForXxx` classes: ```diff -- from transformers import AutoModelForSequenceClassification -+ from optimum.intel import OVModelForSequenceClassification +- from transformers import AutoModelForCausalLM ++ from optimum.intel import OVModelForCausalLM from transformers import AutoTokenizer, pipeline - model_id = "distilbert-base-uncased-finetuned-sst-2-english" -- model = AutoModelForSequenceClassification.from_pretrained(model_id) -+ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) + model_id = "helenai/gpt2-ov" +- model = AutoModelForCausalLM.from_pretrained(model_id) ++ model = OVModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - cls_pipe = pipeline("text-classification", model=model, tokenizer=tokenizer) - outputs = cls_pipe("He's a dreadful magician.") - - [{'label': 'NEGATIVE', 'score': 0.9919503927230835}] + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + results = pipe("He's a dreadful magician and") ``` See the [reference documentation](reference_ov) for more information about parameters, and examples for different tasks. To easily save the resulting model, you can use the `save_pretrained()` method, which will save both the BIN and XML files describing the graph. It is useful to save the tokenizer to the same directory, to enable easy loading of the tokenizer for the model. - ```python # Save the exported model save_directory = "openvino_distilbert" @@ -46,6 +94,28 @@ model.save_pretrained(save_directory) tokenizer.save_pretrained(save_directory) ``` +### Weight only quantization + +You can also apply INT8 quantization on your models weights when exporting your model with the CLI: + +```bash +optimum-cli export openvino --model gpt2 --int8 ov_model +``` + +This will results in the exported model linear and embedding layers to be quanrtized to INT8, the activations will be kept in floating point precision. + +This can also be done when loading your model by setting the `load_in_8bit` argument when calling the `from_pretrained()` method. + +```python +from optimum.intel import OVModelForCausalLM + +model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) +``` + +To apply quantization on both weights and activations, you can use the `OVQuantizer`, more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#optimization). + +### Static shape + By default, `OVModelForXxx` support dynamic shapes, enabling inputs of every shapes. To speed up inference, static shapes can be enabled by giving the desired inputs shapes. ```python @@ -55,7 +125,6 @@ model.reshape(1, 9) model.compile() ``` - When fixing the shapes with the `reshape()` method, inference cannot be performed with an input of a different shape. When instantiating your pipeline, you can specify the maximum total input sequence length after tokenization in order for shorter sequences to be padded and for longer sequences to be truncated. ```python @@ -81,16 +150,7 @@ qa_pipe = pipeline( metric = task_evaluator.compute(model_or_pipeline=qa_pipe, data=eval_dataset, metric="squad") ``` - -To run inference on Intel integrated or discrete GPU, use `.to("gpu")`. On GPU, models run in FP16 precision by default. (See [OpenVINO documentation](https://docs.openvino.ai/nightly/openvino_docs_install_guides_configurations_for_intel_gpu.html) about installing drivers for GPU inference). - -```python -# Static shapes speed up inference -model.reshape(1, 9) -model.to("gpu") -# Compile the model before the first inference -model.compile() -``` +### Compilation By default the model will be compiled when instantiating our `OVModel`. In the case where the model is reshaped or placed to another device, the model will need to be recompiled again, which will happen by default before the first inference (thus inflating the latency of the first inference). To avoid an unnecessary compilation, you can disable the first compilation by setting `compile=False`. The model can be compiled before the first inference with `model.compile()`. @@ -106,6 +166,19 @@ model.reshape(1,128) model.compile() ``` +To run inference on Intel integrated or discrete GPU, use `.to("gpu")`. On GPU, models run in FP16 precision by default. (See [OpenVINO documentation](https://docs.openvino.ai/nightly/openvino_docs_install_guides_configurations_for_intel_gpu.html) about installing drivers for GPU inference). + +```python +# Static shapes speed up inference +model.reshape(1, 9) +model.to("gpu") +# Compile the model before the first inference +model.compile() +``` + +### Configuration + + It is possible to pass an `ov_config` parameter to `from_pretrained()` with custom OpenVINO configuration values. This can be used for example to enable full precision inference on devices where FP16 or BF16 inference precision is used by default. @@ -120,7 +193,7 @@ Optimum Intel leverages OpenVINO's model caching to speed up model compiling. By model = OVModelForSequenceClassification.from_pretrained(model_id, ov_config={"CACHE_DIR":""}) ``` -## Sequence-to-sequence models +### Sequence-to-sequence models Sequence-to-sequence (Seq2Seq) models, that generate a new sequence from an input, can also be used when running inference with OpenVINO. When Seq2Seq models are exported to the OpenVINO IR, they are decomposed into two parts : the encoder and the "decoder" (which actually consists of the decoder with the language modeling head), that are later combined during inference. To speed up sequential decoding, a cache with pre-computed key/values hidden-states will be used by default. An additional model component will be exported: the "decoder" with pre-computed key/values as one of its inputs. This specific export comes from the fact that during the first pass, the decoder has no pre-computed key/values hidden-states, while during the rest of the generation past key/values will be used to speed up sequential decoding. To disable this cache, set `use_cache=False` in the `from_pretrained()` method. @@ -147,23 +220,33 @@ tokenizer.save_pretrained(save_directory) [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] ``` -## Stable Diffusion +## Diffusers models + +Make sure you have 🤗 Diffusers installed. + +To install `diffusers`: +```bash +pip install optimum[diffusers] +``` + + +### Stable Diffusion Stable Diffusion models can also be used when running inference with OpenVINO. When Stable Diffusion models -are exported to the OpenVINO format, they are decomposed into three components that are later combined during inference: +are exported to the OpenVINO format, they are decomposed into different components that are later combined during inference: - The text encoder - The U-NET - The VAE encoder - The VAE decoder -Make sure you have 🤗 Diffusers installed. +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-to-image` | `OVStableDiffusionPipeline` | +| `image-to-image` | `OVStableDiffusionImg2ImgPipeline` | +| `inpaint` | `OVStableDiffusionInpaintPipeline` | -To install `diffusers`: -```bash -pip install optimum[diffusers] -``` -### Text-to-Image +#### Text-to-Image Here is an example of how you can load an OpenVINO Stable Diffusion model and run inference using OpenVINO Runtime: ```python @@ -208,7 +291,7 @@ In case you want to change any parameters such as the outputs height or width, y -### Text-to-Image with Textual Inversion +#### Text-to-Image with Textual Inversion Here is an example of how you can load an OpenVINO Stable Diffusion model with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: @@ -248,7 +331,7 @@ The left image shows the generation result of original stable diffusion v1.5, th | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_without_textual_inversion.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_with_textual_inversion.png) | -### Image-to-Image +#### Image-to-Image ```python import requests @@ -269,16 +352,15 @@ image = pipeline(prompt=prompt, image=init_image, strength=0.75, guidance_scale= image.save("fantasy_landscape.png") ``` -## Stable Diffusion XL +### Stable Diffusion XL -Before using `OVtableDiffusionXLPipeline` make sure to have `diffusers` and `invisible_watermark` installed. You can install the libraries as follows: +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-to-image` | `OVStableDiffusionXLPipeline` | +| `image-to-image` | `OVStableDiffusionXLImg2ImgPipeline` | -```bash -pip install diffusers -pip install invisible-watermark>=0.2.0 -``` -### Text-to-Image +#### Text-to-Image Here is an example of how you can load a SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference using OpenVINO Runtime: @@ -296,7 +378,7 @@ image.save("train_station.png") |---|---| | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich_2.png) | -### Text-to-Image with Textual Inversion +#### Text-to-Image with Textual Inversion Here is an example of how you can load an SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: @@ -338,7 +420,7 @@ The left image shows the generation result of the original SDXL base 1.0, the ri | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/sdxl_without_textual_inversion.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/sdxl_with_textual_inversion.png) | -### Image-to-Image +#### Image-to-Image Here is an example of how you can load a PyTorch SDXL model, convert it to OpenVINO on-the-fly and run inference using OpenVINO Runtime for *image-to-image*: @@ -358,7 +440,7 @@ pipeline.save_pretrained("openvino-sd-xl-refiner-1.0") ``` -### Refining the image output +#### Refining the image output The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0). In this case, you only have to output the latents from the base model. @@ -372,27 +454,3 @@ refiner = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=Tr image = base(prompt=prompt, output_type="latent").images[0] image = refiner(prompt=prompt, image=image[None, :]).images[0] ``` - - - -## Supported tasks - -As shown in the table below, each task is associated with a class enabling to automatically load your model. - - -| Task | Auto Class | -|--------------------------------------|--------------------------------------| -| `text-classification` | `OVModelForSequenceClassification` | -| `token-classification` | `OVModelForTokenClassification` | -| `question-answering` | `OVModelForQuestionAnswering` | -| `audio-classification` | `OVModelForAudioClassification` | -| `image-classification` | `OVModelForImageClassification` | -| `feature-extraction` | `OVModelForFeatureExtraction` | -| `fill-mask` | `OVModelForMaskedLM` | -| `text-generation` | `OVModelForCausalLM` | -| `text2text-generation` | `OVModelForSeq2SeqLM` | -| `text-to-image` | `OVStableDiffusionPipeline` | -| `text-to-image` | `OVStableDiffusionXLPipeline` | -| `image-to-image` | `OVStableDiffusionImg2ImgPipeline` | -| `image-to-image` | `OVStableDiffusionXLImg2ImgPipeline` | -| `inpaint` | `OVStableDiffusionInpaintPipeline` | diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 866573dca9..0f51d3cb60 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -62,6 +62,27 @@ tokenizer.save_pretrained(save_dir) The `quantize()` method applies post-training static quantization and export the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device. +### Weights compression + +For large language models (LLMs), it is often beneficial to only quantize weights, and keep activations in floating point precision. This method does not require a calibration dataset. To enable weights compression, set the `weights_only` parameter of `OVQuantizer`: + +```python +from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM +from transformers import AutoModelForCausalLM + +save_dir = "int8_weights_compressed_model" +model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") +quantizer = OVQuantizer.from_pretrained(model, task="text-generation") +quantizer.quantize(save_directory=save_dir, weights_only=True) +``` + +To load the optimized model for inference: + +```python +optimized_model = OVModelForCausalLM.from_pretrained(save_dir) +``` + +Weights compression is enabled for PyTorch and OpenVINO models: the starting model can be an `AutoModelForCausalLM` or `OVModelForCausalLM` instance. ## Training-time optimization @@ -221,4 +242,4 @@ text = "He's a dreadful magician." outputs = cls_pipe(text) [{'label': 'NEGATIVE', 'score': 0.9840195178985596}] -``` \ No newline at end of file +``` diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 782aa0bc0d..cb011706c8 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -18,7 +18,7 @@ from typing import Any, Callable, Dict, Optional, Union from requests.exceptions import ConnectionError as RequestsConnectionError -from transformers import AutoTokenizer +from transformers import AutoConfig, AutoTokenizer from optimum.exporters import TasksManager from optimum.exporters.onnx import __main__ as optimum_main @@ -27,7 +27,6 @@ from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors from ...intel.utils.import_utils import is_nncf_available -from ...intel.utils.modeling_utils import patch_decoder_attention_mask from .convert import export_models @@ -137,6 +136,41 @@ def main_export( original_task = task task = TasksManager.map_from_synonym(task) + # Patch the modules to export of GPTQ models w/o GPU + do_gptq_patching = False + try: + config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) + config_dict = config.to_dict() + quantization_config = config_dict.get("quantization_config", None) + do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" + except Exception: + pass + + if do_gptq_patching: + import torch + + torch.set_default_dtype(torch.float32) + orig_cuda_check = torch.cuda.is_available + torch.cuda.is_available = lambda: True + + from optimum.gptq import GPTQQuantizer + + orig_post_init_model = GPTQQuantizer.post_init_model + + def post_init_model(self, model): + from auto_gptq import exllama_set_max_input_length + + class StoreAttr(object): + pass + + model.quantize_config = StoreAttr() + model.quantize_config.desc_act = self.desc_act + if self.desc_act and not self.disable_exllama and self.max_input_length is not None: + model = exllama_set_max_input_length(model, self.max_input_length) + return model + + GPTQQuantizer.post_init_model = post_init_model + framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) # get the shapes to be used to generate dummy inputs @@ -222,24 +256,18 @@ def main_export( preprocessors = maybe_load_preprocessors( model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code ) - if not task.startswith("text-generation"): - onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( - model=model, - task=task, - monolith=False, - custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, - custom_architecture=custom_architecture, - fn_get_submodels=fn_get_submodels, - preprocessors=preprocessors, - _variant="default", - ) - else: - # TODO : ModelPatcher will be added in next optimum release - model = patch_decoder_attention_mask(model) - onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) - onnx_config = onnx_config_constructor(model.config) - models_and_onnx_configs = {"model": (model, onnx_config)} + onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( + model=model, + task=task, + monolith=False, + custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, + custom_architecture=custom_architecture, + fn_get_submodels=fn_get_submodels, + preprocessors=preprocessors, + _variant="default", + legacy=False, + ) if int8 is None: int8 = False @@ -324,3 +352,8 @@ def main_export( int8=int8, model_kwargs=model_kwargs, ) + + # Unpatch modules after GPTQ export + if do_gptq_patching: + torch.cuda.is_available = orig_cuda_check + GPTQQuantizer.post_init_model = orig_post_init_model diff --git a/optimum/intel/generation/modeling.py b/optimum/intel/generation/modeling.py index b4c41e0be1..bbfc3db63d 100644 --- a/optimum/intel/generation/modeling.py +++ b/optimum/intel/generation/modeling.py @@ -26,12 +26,13 @@ from transformers.utils import WEIGHTS_NAME from optimum.exporters import TasksManager +from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.modeling_base import OptimizedModel from optimum.utils import NormalizedConfigManager from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_torch_version, is_transformers_version -from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask +from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask if is_transformers_version("<", "4.25.0"): @@ -47,43 +48,29 @@ def prepare_jit_inputs(model: PreTrainedModel, task: str, use_cache: bool = Fals task = _TASK_ALIASES.get(task, task) signature = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.__call__) onnx_config_class = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) - onnx_config = onnx_config_class(model.config) - if task == "text-generation" and use_cache: - onnx_config = onnx_config_class(model.config, use_past=True, use_past_in_inputs=True) + if "text-generation" in task: + onnx_config = onnx_config_class(model.config, use_past=use_cache, use_past_in_inputs=use_cache) + else: + onnx_config = onnx_config_class(model.config) + dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt") - model_inputs = {key: dummy_inputs[key] for key in signature.parameters if dummy_inputs.get(key, None) is not None} - if task == "text-generation" and use_cache and model.config.model_type != "gpt_bigcode": - # WA jit.trace issue of model like llama in https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L464, or else, generation output will be incorrect - pkv = [] - for i in range(len(model_inputs["past_key_values"])): - pkv.append([]) - for j in range(len(model_inputs["past_key_values"][0])): - pkv[i].append(model_inputs["past_key_values"][i][j].to(model.dtype)) - pkv[i] = tuple(pkv[i]) - model_inputs["past_key_values"] = tuple(pkv) - i = model_inputs["input_ids"] - a = model_inputs["attention_mask"] - model_inputs["input_ids"] = torch.cat([torch.zeros(i.shape[0], 1), i], -1).to(i.dtype) - model_inputs["attention_mask"] = torch.cat([torch.zeros(a.shape[0], 1), a], -1).to(a.dtype) - return model_inputs + + return {key: dummy_inputs[key] for key in signature.parameters if dummy_inputs.get(key, None) is not None} def jit_trace(model: PreTrainedModel, task: str, use_cache: bool = False): model_inputs = prepare_jit_inputs(model, task, use_cache) # check if the model_inputs is correct. model(**model_inputs) + torch._C._jit_set_texpr_fuser_enabled(False) if "past_key_values" in model_inputs.keys(): model.config.return_dict = False - if is_torch_version(">", "2.0.1"): - traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False) - else: - traced_model = torch.jit.trace(model, example_inputs=tuple(model_inputs.values()), strict=False) + if is_torch_version(">=", "2.1.0"): + traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False) else: - if is_torch_version(">=", "2.0.0"): - traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False) - else: - traced_model = torch.jit.trace(model, example_inputs=tuple(model_inputs.values()), strict=False) + traced_model = torch.jit.trace(model, example_inputs=tuple(model_inputs.values()), strict=False) + traced_model = torch.jit.freeze(traced_model.eval()) traced_model(**model_inputs) traced_model(**model_inputs) @@ -91,11 +78,7 @@ def jit_trace(model: PreTrainedModel, task: str, use_cache: bool = False): return traced_model -class PreTrainedModel(OptimizedModel): - pass - - -class BaseModelForCausalLM(PreTrainedModel, GenerationMixin): +class BaseModelForCausalLM(OptimizedModel, GenerationMixin): auto_model_class = AutoModelForCausalLM export_feature = "text-generation" main_input_name = "input_ids" @@ -156,12 +139,23 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) + position_ids = kwargs.get("position_ids", None) + + attention_mask = kwargs.get("attention_mask", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + return { "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": self.use_cache, - "position_ids": None, - "attention_mask": kwargs.get("attention_mask", None), + "position_ids": position_ids, + "attention_mask": attention_mask, "token_type_ids": None, } @@ -258,6 +252,7 @@ def forward( input_ids: torch.LongTensor = None, attention_mask: Optional[torch.FloatTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + position_ids: Optional[torch.FloatTensor] = None, **kwargs, ) -> CausalLMOutputWithPast: if attention_mask is None: @@ -268,43 +263,42 @@ def forward( "attention_mask": attention_mask, } + model_type = self.config.model_type.replace("_", "-") + if self.use_cache: if past_key_values is None: nb_pkv = 2 num_layers = self.normalized_config.num_layers - num_attention_heads = self.normalized_config.num_attention_heads - num_key_value_heads = num_attention_heads - if hasattr(self.normalized_config, "num_key_value_heads"): - num_key_value_heads = self.normalized_config.num_key_value_heads - hidden_size = self.normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - if self.config.model_type == "gpt_bigcode": - new_shape = [input_ids.shape[0], 0, d_k * 2] - empty_tensor = torch.empty(size=new_shape) - if self.model_dtype is not None: - empty_tensor = empty_tensor.to(self.model_dtype) - past_key_values = tuple([empty_tensor] * num_layers) - elif self.config.model_type != "bloom": - new_shape = [input_ids.shape[0], num_key_value_heads, 0, d_k] - empty_tensor = torch.empty(size=new_shape) - if self.model_dtype is not None: - empty_tensor = empty_tensor.to(self.model_dtype) - pkv = tuple(empty_tensor for _ in range(nb_pkv)) + d_k = self.normalized_config.hidden_size // self.normalized_config.num_attention_heads + batch_size = input_ids.shape[0] + + if model_type in {"mistral", "llama"}: + num_attention_heads = self.normalized_config.num_key_value_heads else: - pkv = () - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_ids.shape[0] * num_key_value_heads, d_k, 0] - else: - new_shape = [input_ids.shape[0] * num_key_value_heads, 0, d_k] - empty_tensor = torch.empty(size=new_shape) - if self.model_dtype is not None: - empty_tensor = empty_tensor.to(self.model_dtype) - pkv = pkv + (empty_tensor,) - if past_key_values is None: - past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) + num_attention_heads = self.normalized_config.num_attention_heads + + if model_type == "bloom": + shape_key = (batch_size * num_attention_heads, d_k, 0) + shape_value = (batch_size * num_attention_heads, 0, d_k) + key = torch.empty(size=shape_key, dtype=self.model_dtype, device=self._device) + value = torch.empty(size=shape_value, dtype=self.model_dtype, device=self._device) + past_key_values = tuple( + tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) for _ in range(num_layers) + ) + elif model_type.replace("-", "_") in MULTI_QUERY_ATTN_MODELS: + shape = (batch_size, 0, d_k * 2) + pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device) + past_key_values = tuple(pkv for _ in range(num_layers)) + else: + shape = (batch_size, num_attention_heads, 0, d_k) + pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device) + past_key_values = tuple(tuple(pkv for _ in range(nb_pkv)) for _ in range(num_layers)) inputs["past_key_values"] = past_key_values + + if position_ids is not None and model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + inputs["position_ids"] = position_ids + outputs = self.model(**inputs) if isinstance(outputs, (list, tuple)): @@ -389,7 +383,7 @@ def _from_transformers( torch_dtype: Optional[Union[str, "torch.dtype"]] = None, **kwargs, ): - if is_torch_version("<", "2.0.0"): + if is_torch_version("<", "2.1.0"): raise ImportError("`torch>=2.0.0` is needed to trace your model") task = cls.export_feature @@ -405,12 +399,7 @@ def _from_transformers( } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) - - if model.config.model_type == "bloom": - model.transformer._prepare_attn_mask = _prepare_attn_mask - - if model.config.model_type == "llama": - model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + model = patch_decoder_attention_mask(model) traced_model = jit_trace(model, task, use_cache) save_dir = TemporaryDirectory() diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 36f16524c2..d4846adc15 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -15,6 +15,7 @@ import copy import inspect import logging +import warnings from enum import Enum from itertools import chain from pathlib import Path @@ -30,16 +31,25 @@ from neural_compressor.quantization import fit from torch.utils.data import DataLoader, RandomSampler from transformers import ( + AutoModelForCausalLM, + AutoModelForMaskedLM, + AutoModelForMultipleChoice, + AutoModelForQuestionAnswering, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoModelForVision2Seq, DataCollator, PretrainedConfig, PreTrainedModel, + XLNetLMHeadModel, default_data_collator, ) from optimum.exporters import TasksManager from optimum.exporters.onnx import OnnxConfig from optimum.onnxruntime import ORTModel -from optimum.onnxruntime.modeling_decoder import ORTModelDecoder +from optimum.onnxruntime.modeling_decoder import ORTModelForCausalLM from optimum.onnxruntime.modeling_seq2seq import ORTModelForConditionalGeneration from optimum.onnxruntime.utils import ONNX_DECODER_NAME from optimum.quantization_base import OptimumQuantizer @@ -256,7 +266,7 @@ def quantize( if isinstance(self._original_model, ORTModelForConditionalGeneration): raise RuntimeError("ORTModelForConditionalGeneration not supported for quantization") - if isinstance(self._original_model, ORTModelDecoder): + if isinstance(self._original_model, ORTModelForCausalLM): model_or_path = self._original_model.onnx_paths if len(model_or_path) > 1: raise RuntimeError( @@ -528,3 +538,49 @@ def _apply_quantization_from_config(q_config: Dict, model: torch.nn.Module) -> t q_model = convert(q_model, mapping=q_mapping, inplace=True) return q_model + + +class IncQuantizedModel(INCModel): + @classmethod + def from_pretrained(cls, *args, **kwargs): + warnings.warn( + f"The class `{cls.__name__}` has been depreciated and will be removed in optimum-intel v1.12, please use " + f"`{cls.__name__.replace('IncQuantized', 'INC')}` instead." + ) + return super().from_pretrained(*args, **kwargs) + + +class IncQuantizedModelForQuestionAnswering(IncQuantizedModel): + auto_model_class = AutoModelForQuestionAnswering + + +class IncQuantizedModelForSequenceClassification(IncQuantizedModel): + auto_model_class = AutoModelForSequenceClassification + + +class IncQuantizedModelForTokenClassification(IncQuantizedModel): + auto_model_class = AutoModelForTokenClassification + + +class IncQuantizedModelForMultipleChoice(IncQuantizedModel): + auto_model_class = AutoModelForMultipleChoice + + +class IncQuantizedModelForSeq2SeqLM(IncQuantizedModel): + auto_model_class = AutoModelForSeq2SeqLM + + +class IncQuantizedModelForCausalLM(IncQuantizedModel): + auto_model_class = AutoModelForCausalLM + + +class IncQuantizedModelForMaskedLM(IncQuantizedModel): + auto_model_class = AutoModelForMaskedLM + + +class IncQuantizedModelForXLNetLM(IncQuantizedModel): + auto_model_class = XLNetLMHeadModel + + +class IncQuantizedModelForVision2Seq(IncQuantizedModel): + auto_model_class = AutoModelForVision2Seq diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py index 8e8fec1758..918a2e4885 100644 --- a/optimum/intel/neural_compressor/trainer.py +++ b/optimum/intel/neural_compressor/trainer.py @@ -15,12 +15,21 @@ import copy import math import os +import shutil import sys import time from collections.abc import Mapping from itertools import chain from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union + +# Integrations must be imported before ML frameworks: +# isort: off +from transformers.integrations import hp_params +from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available + +# isort: on + import datasets import torch import torch.distributed as dist @@ -28,38 +37,35 @@ from neural_compressor.compression import DistillationCallbacks from neural_compressor.conf.pythonic_config import _BaseQuantizationConfig from neural_compressor.experimental.export import torch_to_fp32_onnx, torch_to_int8_onnx - -# from packaging import version +from packaging import version from torch import nn from torch.utils.data import Dataset, RandomSampler -from torch.utils.data.dataloader import DataLoader -from torch.utils.data.distributed import DistributedSampler -from tqdm.auto import tqdm from transformers import Trainer from transformers.data.data_collator import DataCollator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow -from transformers.deepspeed import deepspeed_init -from transformers.file_utils import WEIGHTS_NAME - -# Integrations must be imported before ML frameworks: -from transformers.integrations import hp_params from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype, unwrap_model from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from transformers.pytorch_utils import is_torch_less_than_1_11 from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import TRAINER_STATE_NAME from transformers.trainer_callback import TrainerCallback, TrainerState -from transformers.trainer_pt_utils import IterableDatasetShard +from transformers.trainer_pt_utils import get_dataloader_sampler, get_model_param_count from transformers.trainer_utils import ( EvalPrediction, HPSearchBackend, - ShardedDDPOption, TrainOutput, has_length, speed_metrics, ) -from transformers.training_args import TrainingArguments -from transformers.utils import is_apex_available, is_sagemaker_mp_enabled, logging +from transformers.training_args import ParallelMode, TrainingArguments +from transformers.utils import ( + WEIGHTS_NAME, + is_accelerate_available, + is_apex_available, + is_sagemaker_mp_enabled, + is_torch_tpu_available, + logging, +) from optimum.exporters import TasksManager @@ -68,12 +74,31 @@ from .configuration import INCConfig +if is_accelerate_available(): + from accelerate import __version__ as accelerate_version + from accelerate import skip_first_batches + + if version.parse(accelerate_version) > version.parse("0.20.3"): + pass + DATA_SAMPLERS = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + from accelerate.data_loader import SeedableRandomSampler + + DATA_SAMPLERS += [SeedableRandomSampler] + + if is_deepspeed_available(): + pass + + if is_apex_available(): from apex import amp if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp +if is_torch_tpu_available(check_device=False): + import torch_xla.core.xla_model as xm + if TYPE_CHECKING: from optimum.exporters.onnx import OnnxConfig @@ -109,6 +134,8 @@ def __init__( task: Optional[str] = None, save_onnx_model: bool = False, ): + self.neftune_noise_alpha = None + super().__init__( model, args, @@ -178,7 +205,9 @@ def __init__( def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): + self.accelerator.free_memory() self._train_batch_size = batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") # Data loader and number of training steps train_dataloader = self.get_train_dataloader() @@ -186,9 +215,10 @@ def _inner_training_loop( # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps - total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size len_dataloader = None + num_train_tokens = None if has_length(train_dataloader): len_dataloader = len(train_dataloader) num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps @@ -230,58 +260,106 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = ( - self.sharded_ddp is not None - and self.sharded_ddp != ShardedDDPOption.SIMPLE - or is_sagemaker_mp_enabled() - or self.fsdp is not None - ) - if args.deepspeed: - deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( - self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint - ) - self.model = deepspeed_engine.module - self.model_wrapped = deepspeed_engine - self.deepspeed = deepspeed_engine - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - elif not delay_optimizer_creation: + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled + + if self.is_deepspeed_enabled: + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) + + if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + # Activate gradient checkpointing if needed if args.gradient_checkpointing: - self.model.gradient_checkpointing_enable() + if args.gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + else: + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs + + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model = self._wrap_model(self.model_wrapped) - if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: - self._load_from_checkpoint(resume_from_checkpoint, model) + # as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if delay_optimizer_creation: + if use_accelerator_prepare: + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. + model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model - if delay_optimizer_creation: - self.create_optimizer_and_scheduler(num_training_steps=max_steps) + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) + elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) # important: at this point: # self.model is the Transformers Model - # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. # Train! logger.info("***** Running training *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Num Epochs = {num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {max_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") self.state.epoch = 0 start_time = time.time() @@ -306,20 +384,19 @@ def _inner_training_loop( logger.info(f" Continuing training from global step {self.state.global_step}") if not args.ignore_data_skip: logger.info( - f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " - "batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` " - "flag to your launch command, but you will resume the training on data already seen by your model." + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." ) - if self.is_local_process_zero() and not args.disable_tqdm: - steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) - steps_trained_progress_bar.set_description("Skipping the first batches") # Update the references self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader - self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None + if self.hp_name is not None and self._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. + self.state.trial_name = self.hp_name(self._trial) if trial is not None: assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial self.state.trial_params = hp_params(assignments) @@ -347,26 +424,26 @@ def _inner_training_loop( # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: for epoch in range(epochs_trained): - is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( - train_dataloader.sampler, RandomSampler - ) + sampler = get_dataloader_sampler(train_dataloader) + sampler_kinds = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + sampler_kinds.append(SeedableRandomSampler) + is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) if is_torch_less_than_1_11 or not is_random_sampler: # We just need to begin an iteration to create the randomization of the sampler. - # That was before PyTorch 1.11 however... for _ in train_dataloader: break else: # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! - _ = list(train_dataloader.sampler) + sampler = sampler if sampler is not None else [] + _ = list(sampler) + total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): - if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): - train_dataloader.sampler.set_epoch(epoch) - elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard): - train_dataloader.dataset.set_epoch(epoch) - epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) # Reset the past mems state at the beginning of each epoch if necessary. if args.past_index >= 0: @@ -385,8 +462,21 @@ def _inner_training_loop( if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + step = -1 for step, inputs in enumerate(epoch_iterator): + total_batched_samples += 1 + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -404,18 +494,14 @@ def _inner_training_loop( if self._compression_manager is not None: self._compression_manager.callbacks.on_step_begin(step) - if ( - ((step + 1) % args.gradient_accumulation_steps != 0) - and args.local_rank != -1 - and args._no_sync_in_gradient_accumulation - ): - # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. - with model.no_sync(): - tr_loss_step = self.training_step(model, inputs) - else: + with self.accelerator.accumulate(model): tr_loss_step = self.training_step(model, inputs) - if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)): + if ( + args.logging_nan_inf_filter + and not is_torch_tpu_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): # if loss is nan or inf simply add the average of previous logged losses tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) else: @@ -423,35 +509,38 @@ def _inner_training_loop( self.current_flos += float(self.floating_point_ops(inputs)) - # Optimizer step for deepspeed must be called on every step regardless of the value of gradient_accumulation_steps - if self.deepspeed: - self.deepspeed.step() + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) - if (step + 1) % args.gradient_accumulation_steps == 0 or ( + if ( + total_batched_samples % args.gradient_accumulation_steps == 0 + or # last step in epoch but step is always smaller than gradient_accumulation_steps - steps_in_epoch <= args.gradient_accumulation_steps - and (step + 1) == steps_in_epoch + is_last_step_and_steps_less_than_grad_acc ): + # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered + # in accelerate. So, explicitly enable sync gradients to True in that case. + if is_last_step_and_steps_less_than_grad_acc or ( + version.parse(accelerate_version) <= version.parse("0.20.3") + ): + self.accelerator.gradient_state._set_sync_gradients(True) + # Gradient clipping - if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed: + if args.max_grad_norm is not None and args.max_grad_norm > 0: # deepspeed does its own clipping - if self.do_grad_scaling: - # AMP: gradients need unscaling - self.scaler.unscale_(self.optimizer) - if is_sagemaker_mp_enabled() and args.fp16: self.optimizer.clip_master_grads(args.max_grad_norm) - elif hasattr(self.optimizer, "clip_grad_norm"): - # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping - self.optimizer.clip_grad_norm(args.max_grad_norm) - elif hasattr(model, "clip_grad_norm_"): - # Some models (like FullyShardedDDP) have a specific way to do gradient clipping - model.clip_grad_norm_(args.max_grad_norm) - else: + elif self.use_apex: # Revert to normal clipping otherwise, handling Apex or full precision nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer) if self.use_apex else model.parameters(), + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + self.accelerator.clip_grad_norm_( + model.parameters(), args.max_grad_norm, ) @@ -459,27 +548,20 @@ def _inner_training_loop( self._compression_manager.callbacks.on_before_optimizer_step() # Optimizer step - optimizer_was_run = True - if self.deepspeed: - pass # called outside the loop - elif self.do_grad_scaling: - scale_before = self.scaler.get_scale() - self.scaler.step(self.optimizer) - self.scaler.update() - scale_after = self.scaler.get_scale() - optimizer_was_run = scale_before <= scale_after - else: - self.optimizer.step() + self.optimizer.step() if self._compression_manager is not None: self._compression_manager.callbacks.on_after_optimizer_step() - if optimizer_was_run and not self.deepspeed: - self.lr_scheduler.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 - self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch self.control = self.callback_handler.on_step_end(args, self.state, self.control) if self._compression_manager is not None: self._compression_manager.callbacks.on_step_end() @@ -501,7 +583,6 @@ def _inner_training_loop( self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) if self._compression_manager is not None: self._compression_manager.callbacks.on_epoch_end() - self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) if self.control.should_training_stop: @@ -513,9 +594,10 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: - # Wait for everyone to get here so we are sur the model has been saved by process 0. - - if args.local_rank != -1: + # Wait for everyone to get here so we are sure the model has been saved by process 0. + if is_torch_tpu_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() elif is_sagemaker_mp_enabled(): smp.barrier() @@ -526,7 +608,13 @@ def _inner_training_loop( self._total_loss_scalar += tr_loss.item() train_loss = self._total_loss_scalar / self.state.global_step - metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, + ) self.store_flos() metrics["total_flos"] = self.state.total_flos metrics["train_loss"] = train_loss @@ -537,7 +625,26 @@ def _inner_training_loop( self.log(metrics) + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + if self._compression_manager is not None: self._compression_manager.callbacks.on_train_end() diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index a0c753d94d..8cea5eb7b6 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -49,7 +49,7 @@ from optimum.exporters import TasksManager -from ..utils.import_utils import is_timm_available +from ..utils.import_utils import is_timm_available, is_timm_version from .modeling_base import OVBaseModel from .utils import _is_timm_ov_dir @@ -540,6 +540,11 @@ def from_pretrained( "To load a timm model, timm needs to be installed. Please install it with `pip install timm`." ) + if is_timm_version("<", "0.9.0"): + raise ImportError( + "To load a timm model, please make sure to upgrade your `timm` version to at least 0.9.0, you can upgrade it by running `pip install --upgrade timm`" + ) + from .modeling_timm import TimmConfig, TimmForImageClassification, TimmOnnxConfig config = TimmConfig.from_pretrained(model_id, **kwargs) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 9384477eb9..67e8d20502 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -15,7 +15,7 @@ import logging import os from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, gettempdir from typing import Dict, Optional, Union import openvino @@ -43,17 +43,12 @@ logger = logging.getLogger(__name__) -# workaround to enable compatibility between openvino models and transformers pipelines -class PreTrainedModel(OptimizedModel): - pass - - @add_start_docstrings( """ Base OVModel class. """, ) -class OVBaseModel(PreTrainedModel): +class OVBaseModel(OptimizedModel): auto_model_class = None export_feature = None @@ -86,6 +81,12 @@ def __init__( input_names[next((name for name in names if "/" not in name), names[0])] = idx self.input_names = input_names + output_names = {} + for idx, key in enumerate(model.outputs): + names = tuple(key.get_names()) + output_names[next((name for name in names if "/" not in name), names[0])] = idx + self.output_names = output_names + self.model = model self.request = None if enable_compilation: @@ -302,7 +303,7 @@ def _from_transformers( @classmethod def _to_load( cls, - model: PreTrainedModel, + model, config: PretrainedConfig, onnx_config: OnnxConfig, use_auth_token: Optional[Union[bool, str]] = None, @@ -339,11 +340,11 @@ def compile(self): if self.request is None: logger.info(f"Compiling the model to {self._device} ...") ov_config = {**self.ov_config} - if "CACHE_DIR" not in self.ov_config.keys(): - # Set default CACHE_DIR only if it is not set. + if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + # Set default CACHE_DIR only if it is not set, and if the model is not in a temporary directory cache_dir = Path(self.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) - logger.info(f"Set CACHE_DIR to {str(cache_dir)}") + logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") self.request = core.compile_model(self.model, self._device, ov_config) def _reshape( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 0e018f9f62..0fa21e3a4a 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -129,7 +129,6 @@ def __init__( self.main_input_name = "input_ids" self.num_pkv = 2 self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) - self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)} self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.key_value_output_names = [key for key in self.output_names if "present" in key] self._original_model = self.model.clone() # keep original model for serialization @@ -229,34 +228,6 @@ def _from_transformers( if use_cache: task = task + "-with-past" - # Patch the modules to export of GPTQ models w/o GPU - do_gptq_patching = False - config_dict = config.to_dict() - quantization_config = config_dict.get("quantization_config", None) - do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" - if do_gptq_patching: - torch.set_default_dtype(torch.float32) - orig_cuda_check = torch.cuda.is_available - torch.cuda.is_available = lambda: True - - from optimum.gptq import GPTQQuantizer - - orig_post_init_model = GPTQQuantizer.post_init_model - - def post_init_model(self, model): - from auto_gptq import exllama_set_max_input_length - - class StoreAttr(object): - pass - - model.quantize_config = StoreAttr() - model.quantize_config.desc_act = self.desc_act - if self.desc_act and not self.disable_exllama and self.max_input_length is not None: - model = exllama_set_max_input_length(model, self.max_input_length) - return model - - GPTQQuantizer.post_init_model = post_init_model - main_export( model_name_or_path=model_id, output=save_dir_path, @@ -271,11 +242,6 @@ class StoreAttr(object): int8=load_in_8bit, ) - # Unpatch modules after GPTQ export - if do_gptq_patching: - torch.cuda.is_available = orig_cuda_check - GPTQQuantizer.post_init_model = orig_post_init_model - config.is_decoder = True config.is_encoder_decoder = False config.save_pretrained(save_dir_path) @@ -346,6 +312,7 @@ def forward( input_ids: torch.LongTensor, attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> CausalLMOutputWithPast: self.compile() @@ -395,14 +362,28 @@ def forward( inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names: + if "attention_mask" in self.input_names or "position_ids" in self.input_names: if attention_mask is not None: - inputs["attention_mask"] = np.array(attention_mask) + attention_mask = np.array(attention_mask) else: - inputs["attention_mask"] = np.ones( + attention_mask = np.ones( (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype ) + if "attention_mask" in self.input_names: + inputs["attention_mask"] = attention_mask + + if "position_ids" in self.input_names: + if position_ids is not None: + position_ids = np.array(position_ids) + else: + position_ids = np.cumsum(attention_mask, axis=1) - 1 + position_ids[attention_mask == 0] = 1 + if past_key_values: + position_ids = np.expand_dims(position_ids[:, -1], axis=-1) + + inputs["position_ids"] = position_ids + # Run inference self.request.start_async(inputs, shared_memory=True) self.request.wait() @@ -504,7 +485,7 @@ def _from_pretrained( elif model_type == "gpt-bigcode": init_cls = OVGPTBigCodeForCausalLM else: - init_cls = OVModelForCausalLM + init_cls = cls return init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 2d0a31b428..4dc554fad8 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -17,7 +17,7 @@ import os import shutil from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, gettempdir from typing import Any, Dict, List, Optional, Union import numpy as np @@ -531,10 +531,8 @@ def __init__( self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / model_name / self.CONFIG_NAME self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - - # TODO : disable if self._model_dir tmp directory - if "CACHE_DIR" not in self.ov_config: - self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name) + if "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()): + self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") def _compile(self): if self.request is None: diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 87cd18d875..6b759054d0 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -14,6 +14,7 @@ import logging from pathlib import Path +from tempfile import gettempdir from typing import Dict, Optional, Tuple import numpy as np @@ -202,31 +203,32 @@ def __init__( self.decoder_with_past = None enable_compilation = kwargs.get("compile", True) encoder_cache_dir = Path(self.model_save_dir).joinpath("encoder_cache") - encoder_cache_dir.mkdir(parents=True, exist_ok=True) - ov_encoder_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(encoder_cache_dir)} - ) + ov_encoder_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_encoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + ov_encoder_config["CACHE_DIR"] = str(encoder_cache_dir) + self.encoder = OVEncoder( self.encoder_model, self._device, ov_encoder_config, main_input_name=self.main_input_name ) + decoder_cache_dir = Path(self.model_save_dir).joinpath("decoder_cache") - decoder_cache_dir.mkdir(parents=True, exist_ok=True) - ov_decoder_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(decoder_cache_dir)} - ) + ov_decoder_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_decoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + ov_decoder_config["CACHE_DIR"] = str(decoder_cache_dir) + self.decoder = OVDecoder(self.decoder_model, self._device, ov_decoder_config) + if self.use_cache: decoder_past_cache_dir = Path(self.model_save_dir).joinpath("decoder_past_cache") - decoder_past_cache_dir.mkdir(parents=True, exist_ok=True) - ov_decoder_past_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(decoder_past_cache_dir)} - ) + ov_decoder_past_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_decoder_past_config.keys() and not str(self.model_save_dir).startswith( + gettempdir() + ): + ov_decoder_past_config["CACHE_DIR"] = str(decoder_past_cache_dir) + self.decoder_with_past = OVDecoder(self.decoder_with_past_model, self._device, ov_decoder_past_config) if enable_compilation: self.compile() diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index bcc7c2908b..b94f61214d 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -39,7 +39,6 @@ from ...exporters.openvino import export, export_pytorch_via_onnx from ..utils.constant import _TASK_ALIASES -from ..utils.modeling_utils import patch_decoder_attention_mask from .configuration import OVConfig from .modeling_base import OVBaseModel from .modeling_decoder import OVBaseDecoderModel @@ -394,9 +393,10 @@ def _quantize_torchmodel( task = self.task model = self.model self.model.config.save_pretrained(save_directory) - model = patch_decoder_attention_mask(model) - if task == "text-generation": - onnx_config = onnx_config_class(model.config, use_past=model.config.use_cache) + if task.startswith("text-generation"): + onnx_config = onnx_config_class( + model.config, use_past=model.config.use_cache, use_past_in_inputs=model.config.use_cache + ) else: onnx_config = onnx_config_class(model.config) diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 0bba054ad3..dfc659882c 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -16,6 +16,7 @@ import io import math import os +import shutil import sys import time from collections import defaultdict @@ -23,8 +24,15 @@ from pathlib import Path from typing import Callable, Dict, List, Optional, Tuple, Type, Union + +# Integrations must be imported before ML frameworks: +# isort: off +from transformers.integrations import hp_params +from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available + +# isort: on + import openvino -import openvino.runtime import torch import torch.distributed as dist import torch.nn.functional as F @@ -46,40 +54,39 @@ compress_quantize_weights_transformation, ) from openvino.runtime import Core, PartialShape, save_model +from packaging import version +from torch import nn from torch.onnx import export as onnx_export from torch.utils._pytree import tree_map -from torch.utils.data import DataLoader, Dataset, RandomSampler -from torch.utils.data.distributed import DistributedSampler -from tqdm.auto import tqdm +from torch.utils.data import Dataset, RandomSampler from transformers import Trainer from transformers.data.data_collator import DataCollator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow -from transformers.deepspeed import deepspeed_init -from transformers.integrations import hp_params from transformers.modeling_utils import PreTrainedModel, unwrap_model from transformers.pytorch_utils import is_torch_less_than_1_11 from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import TRAINER_STATE_NAME, TRAINING_ARGS_NAME from transformers.trainer_callback import TrainerCallback, TrainerState -from transformers.trainer_pt_utils import IterableDatasetShard +from transformers.trainer_pt_utils import get_dataloader_sampler, get_model_param_count from transformers.trainer_utils import ( EvalPrediction, HPSearchBackend, - ShardedDDPOption, TrainOutput, has_length, speed_metrics, ) +from transformers.training_args import ParallelMode from transformers.utils import ( WEIGHTS_NAME, + is_accelerate_available, is_apex_available, is_sagemaker_mp_enabled, is_torch_tpu_available, logging, ) +from optimum.exporters import TasksManager from optimum.exporters.onnx import OnnxConfig -from optimum.exporters.tasks import TasksManager from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_transformers_version @@ -95,6 +102,22 @@ ) +if is_accelerate_available(): + from accelerate import __version__ as accelerate_version + from accelerate import skip_first_batches + + if version.parse(accelerate_version) > version.parse("0.20.3"): + pass + DATA_SAMPLERS = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + from accelerate.data_loader import SeedableRandomSampler + + DATA_SAMPLERS += [SeedableRandomSampler] + + if is_deepspeed_available(): + pass + + if is_apex_available(): from apex import amp @@ -171,6 +194,8 @@ def __init__( task: Optional[str] = None, feature: Optional[str] = None, ): + self.neftune_noise_alpha = None + super().__init__( model, args, @@ -244,7 +269,9 @@ def _set_signature_columns_if_needed(self): def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): + self.accelerator.free_memory() self._train_batch_size = batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") # Data loader and number of training steps train_dataloader = self.get_train_dataloader() @@ -252,9 +279,10 @@ def _inner_training_loop( # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps - total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size len_dataloader = None + num_train_tokens = None if has_length(train_dataloader): len_dataloader = len(train_dataloader) num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps @@ -268,10 +296,16 @@ def _inner_training_loop( # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's # the best we can do. num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = ( + self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps + ) else: max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) num_train_epochs = math.ceil(args.num_train_epochs) num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size max_steps = args.max_steps # Setting a very large number of epochs so we go as many times as necessary over the iterator. @@ -279,6 +313,8 @@ def _inner_training_loop( num_update_steps_per_epoch = max_steps num_examples = total_train_batch_size * args.max_steps num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps else: raise ValueError( "args.max_steps must be set to a positive value if dataloader does not have a length, was" @@ -287,7 +323,7 @@ def _inner_training_loop( if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: if self.args.n_gpu > 1: - # torch.nn.DataParallel(model) replicates the model, creating new variables and module + # nn.DataParallel(model) replicates the model, creating new variables and module # references registered here no longer work on other gpus, breaking the module raise ValueError( "Currently --debug underflow_overflow is not supported under DP. Please use DDP" @@ -296,30 +332,47 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = ( - self.sharded_ddp is not None - and self.sharded_ddp != ShardedDDPOption.SIMPLE - or is_sagemaker_mp_enabled() - or self.fsdp is not None - ) - if args.deepspeed: - deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( - self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint - ) - self.model = deepspeed_engine.module - self.model_wrapped = deepspeed_engine - self.deepspeed = deepspeed_engine - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - elif not delay_optimizer_creation: + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled + + # We need to reset the scheduler, as its parameters may be different on subsequent calls + if self._created_lr_scheduler: + self.lr_scheduler = None + self._created_lr_scheduler = False + + if self.is_deepspeed_enabled: + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) + + if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + # Activate gradient checkpointing if needed if args.gradient_checkpointing: - self.model.gradient_checkpointing_enable() + if args.gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + else: + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs + + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) if is_transformers_version("<", "4.29.0"): is_distributed = self.args.local_rank != -1 @@ -333,31 +386,67 @@ def _inner_training_loop( model = self._wrap_model(self.model_wrapped) - if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: - self._load_from_checkpoint(resume_from_checkpoint, model) + # as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if delay_optimizer_creation: + if use_accelerator_prepare: + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. + model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model - if delay_optimizer_creation: - self.create_optimizer_and_scheduler(num_training_steps=max_steps) + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) + elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) # important: at this point: # self.model is the Transformers Model - # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. # Train! logger.info("***** Running training *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Num Epochs = {num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {max_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") self.state.epoch = 0 start_time = time.time() @@ -382,20 +471,19 @@ def _inner_training_loop( logger.info(f" Continuing training from global step {self.state.global_step}") if not args.ignore_data_skip: logger.info( - f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " - "batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` " - "flag to your launch command, but you will resume the training on data already seen by your model." + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." ) - if self.is_local_process_zero() and not args.disable_tqdm: - steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) - steps_trained_progress_bar.set_description("Skipping the first batches") # Update the references self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader - self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None + if self.hp_name is not None and self._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. + self.state.trial_name = self.hp_name(self._trial) if trial is not None: assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial self.state.trial_params = hp_params(assignments) @@ -408,6 +496,7 @@ def _inner_training_loop( self.state.is_local_process_zero = self.is_local_process_zero() self.state.is_world_process_zero = self.is_world_process_zero() + # tr_loss is a tensor to avoid synchronization of TPUs through .item() tr_loss = torch.tensor(0.0).to(args.device) self.compression_metrics = defaultdict(lambda: torch.tensor(0.0).to(args.device)) # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses @@ -420,31 +509,33 @@ def _inner_training_loop( # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: for epoch in range(epochs_trained): - is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( - train_dataloader.sampler, RandomSampler - ) + sampler = get_dataloader_sampler(train_dataloader) + sampler_kinds = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + sampler_kinds.append(SeedableRandomSampler) + is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) if is_torch_less_than_1_11 or not is_random_sampler: # We just need to begin an iteration to create the randomization of the sampler. - # That was before PyTorch 1.11 however... for _ in train_dataloader: break else: - # Otherwise we need to call the whole sampler cause there is some random operation added + # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! - _ = list(train_dataloader.sampler) + sampler = sampler if sampler is not None else [] + _ = list(sampler) + total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): - if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): - train_dataloader.sampler.set_epoch(epoch) - elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard): - train_dataloader.dataset.set_epoch(epoch) + epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) # Reset the past mems state at the beginning of each epoch if necessary. if args.past_index >= 0: self._past = None steps_in_epoch = ( - len(train_dataloader) + len(epoch_iterator) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps ) @@ -460,8 +551,21 @@ def _inner_training_loop( if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + step = -1 - for step, inputs in enumerate(train_dataloader): + for step, inputs in enumerate(epoch_iterator): + total_batched_samples += 1 + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -480,17 +584,14 @@ def _inner_training_loop( # Must be called at the beginning of each training step to prepare the compression method self.compression_controller.scheduler.step() + with self.accelerator.accumulate(model): + tr_loss_step = self.training_step(model, inputs) + if ( - ((step + 1) % args.gradient_accumulation_steps != 0) - and args.local_rank != -1 - and args._no_sync_in_gradient_accumulation + args.logging_nan_inf_filter + and not is_torch_tpu_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): - # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. - with model.no_sync(): - tr_loss_step = self.training_step(model, inputs) - else: - tr_loss_step = self.training_step(model, inputs) - if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)): # if loss is nan or inf simply add the average of previous logged losses tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) else: @@ -498,57 +599,52 @@ def _inner_training_loop( self.current_flos += float(self.floating_point_ops(inputs)) - # Optimizer step for deepspeed must be called on every step regardless of the value of gradient_accumulation_steps - if self.deepspeed: - self.deepspeed.step() + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) - if (step + 1) % args.gradient_accumulation_steps == 0 or ( + if ( + total_batched_samples % args.gradient_accumulation_steps == 0 + or # last step in epoch but step is always smaller than gradient_accumulation_steps - steps_in_epoch <= args.gradient_accumulation_steps - and (step + 1) == steps_in_epoch + is_last_step_and_steps_less_than_grad_acc ): + # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered + # in accelerate. So, explicitly enable sync gradients to True in that case. + if is_last_step_and_steps_less_than_grad_acc or ( + version.parse(accelerate_version) <= version.parse("0.20.3") + ): + self.accelerator.gradient_state._set_sync_gradients(True) + # Gradient clipping - if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed: + if args.max_grad_norm is not None and args.max_grad_norm > 0: # deepspeed does its own clipping - if self.do_grad_scaling: - # AMP: gradients need unscaling - self.scaler.unscale_(self.optimizer) - if is_sagemaker_mp_enabled() and args.fp16: self.optimizer.clip_master_grads(args.max_grad_norm) - elif hasattr(self.optimizer, "clip_grad_norm"): - # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping - self.optimizer.clip_grad_norm(args.max_grad_norm) - elif hasattr(model, "clip_grad_norm_"): - # Some models (like FullyShardedDDP) have a specific way to do gradient clipping - model.clip_grad_norm_(args.max_grad_norm) - else: + elif self.use_apex: # Revert to normal clipping otherwise, handling Apex or full precision - torch.nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer) if self.use_apex else model.parameters(), + nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + self.accelerator.clip_grad_norm_( + model.parameters(), args.max_grad_norm, ) # Optimizer step - optimizer_was_run = True - if self.deepspeed: - pass # called outside the loop - elif self.do_grad_scaling: - scale_before = self.scaler.get_scale() - self.scaler.step(self.optimizer) - self.scaler.update() - scale_after = self.scaler.get_scale() - optimizer_was_run = scale_before <= scale_after - else: - self.optimizer.step() - - if optimizer_was_run and not self.deepspeed: - self.lr_scheduler.step() + self.optimizer.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 - self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch self.control = self.callback_handler.on_step_end(args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) @@ -559,7 +655,7 @@ def _inner_training_loop( break if step < 0: logger.warning( - "There seems to be not a single sample in your train_dataloader, stopping training at step" + "There seems to be not a single sample in your epoch_iterator, stopping training at step" f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" f" num_steps ({max_steps}) higher than the number of available samples." ) @@ -577,8 +673,10 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: - # Wait for everyone to get here so we are sur the model has been saved by process 0. - if args.local_rank != -1: + # Wait for everyone to get here so we are sure the model has been saved by process 0. + if is_torch_tpu_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() elif is_sagemaker_mp_enabled(): smp.barrier() @@ -589,7 +687,13 @@ def _inner_training_loop( self._total_loss_scalar += tr_loss.item() train_loss = self._total_loss_scalar / self.state.global_step - metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, + ) self.store_flos() metrics["total_flos"] = self.state.total_flos metrics["train_loss"] = train_loss @@ -600,8 +704,26 @@ def _inner_training_loop( self.log(metrics) + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + return TrainOutput(self.state.global_step, train_loss, metrics) def compute_distillation_loss(self, inputs, student_outputs): @@ -705,12 +827,12 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): if state_dict is None: state_dict = self.model.state_dict() if is_pretrained_model: - unwrapped_model.save_pretrained(output_dir, state_dict=state_dict) + unwrapped_model.save_pretrained(output_dir, state_dict=state_dict, safe_serialization=False) else: logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: - self.model.save_pretrained(output_dir, state_dict=state_dict) + self.model.save_pretrained(output_dir, state_dict=state_dict, safe_serialization=False) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index d781f1f513..d15780384f 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -71,7 +71,10 @@ try: _openvino_version = importlib_metadata.version("openvino") except importlib_metadata.PackageNotFoundError: - _openvino_available = False + try: + _openvino_version = importlib_metadata.version("openvino-nightly") + except importlib_metadata.PackageNotFoundError: + _openvino_available = False _nncf_available = importlib.util.find_spec("nncf") is not None @@ -220,6 +223,15 @@ def is_ipex_version(operation: str, version: str): return compare_versions(parse(_ipex_version), operation, version) +def is_timm_version(operation: str, version: str): + """ + Compare the current timm version to a given reference with an operation. + """ + if not _timm_available: + return False + return compare_versions(parse(_timm_version), operation, version) + + DIFFUSERS_IMPORT_ERROR = """ {0} requires the diffusers library but it was not found in your environment. You can install it with pip: `pip install diffusers`. Please note that you may need to restart your runtime after installation. diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index b56e5e4f2d..1a3b6fbede 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -18,9 +18,6 @@ from transformers.modeling_utils import PreTrainedModel -# from ...utils.modeling_utils import _prepare_decoder_sliding_window_attention_mask - - MULTI_QUERY_ATTN_MODELS = {"falcon", "gpt_bigcode"} @@ -98,6 +95,40 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, return combined_attention_mask +# Modified from transformers.models.mistral.modeling_mistral._prepare_decoder_sliding_window_attention_mask +def _prepare_decoder_sliding_window_attention_mask( + attention_mask: torch.Tensor, + input_shape: Tuple[int, int], + inputs_embeds: torch.Tensor, + past_key_values_length: int, + sliding_window: int, +): + from transformers.models.mistral.modeling_mistral import _expand_mask, _make_sliding_window_causal_mask + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + + combined_attention_mask = _make_sliding_window_causal_mask( + input_shape, + device=inputs_embeds.device, + dtype=inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + sliding_window=sliding_window, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def patch_decoder_attention_mask(model: "PreTrainedModel"): """ Apply patch on decoder with past model forward to resolve first inference based on model architecture @@ -112,8 +143,8 @@ def patch_decoder_attention_mask(model: "PreTrainedModel"): model.transformer._prepare_attn_mask = _prepare_attn_mask elif model.config.model_type == "llama": model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask - # elif model.config.model_type == "mistral": - # model.model._prepare_decoder_attention_mask = _prepare_decoder_sliding_window_attention_mask + elif model.config.model_type == "mistral": + model.model._prepare_decoder_attention_mask = _prepare_decoder_sliding_window_attention_mask elif model.config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask return model diff --git a/setup.py b/setup.py index 6d81b98b2a..7949f6d11d 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) INSTALL_REQUIRE = [ - "optimum>=1.13.0", + "optimum>=1.14.0", "transformers>=4.20.0", "datasets>=1.4.0", "sentencepiece", @@ -41,8 +41,9 @@ "neural-compressor>=2.2.0", "onnx", "onnxruntime<1.15.0", + "transformers>=4.33.0", ], - "openvino": ["openvino>=2023.1.0", "onnx", "onnxruntime"], + "openvino": ["openvino>=2023.1.0", "onnx", "onnxruntime", "transformers>=4.33.0"], "nncf": ["nncf>=2.6.0"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers"], diff --git a/tests/generation/test_modeling.py b/tests/generation/test_modeling.py index 0fd668ad8f..db36b924f4 100644 --- a/tests/generation/test_modeling.py +++ b/tests/generation/test_modeling.py @@ -20,6 +20,7 @@ from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, pipeline, set_seed +from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel.generation.modeling import TSModelForCausalLM @@ -28,6 +29,9 @@ "gptj": "hf-internal-testing/tiny-random-gptj", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "mistral": "echarlaix/tiny-random-mistral", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", } SEED = 42 @@ -48,7 +52,11 @@ class ModelingIntegrationTest(unittest.TestCase): "gpt2", "gptj", "gpt_neo", + "mistral", + "llama", + # "gpt_bigcode", ) + GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -61,7 +69,12 @@ def test_compare_to_transformers(self, model_arch): trfs_model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer("This is a sample", return_tensors="pt") - outputs = model(**tokens) + + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + outputs = model(**tokens, position_ids=position_ids) self.assertIsInstance(outputs.logits, torch.Tensor) with torch.no_grad(): trfs_outputs = trfs_model(**tokens) @@ -71,7 +84,8 @@ def test_compare_to_transformers(self, model_arch): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) loaded_model = TSModelForCausalLM.from_pretrained(tmpdirname) - loaded_model_outputs = loaded_model(**tokens) + loaded_model_outputs = loaded_model(**tokens, position_ids=position_ids) + self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -120,7 +134,6 @@ def test_compare_with_and_without_past_key_values(self): model_id = MODEL_NAMES["gpt2"] tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer("This is a sample input", return_tensors="pt") - model_with_pkv = TSModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True) # Warmup _ = model_with_pkv.generate(**tokens) @@ -136,6 +149,9 @@ def test_compare_with_and_without_past_key_values(self): outputs_model_without_pkv = model_without_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) + self.assertTrue(model_with_pkv.use_cache) + self.assertFalse(model_without_pkv.use_cache) + self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py index fc2a310595..8098f011c5 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -19,6 +19,7 @@ import unittest import torch +from packaging.version import Version, parse from parameterized import parameterized from transformers import AutoTokenizer, pipeline, set_seed @@ -39,6 +40,7 @@ INCTrainer, ) from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME +from optimum.version import __version__ as _optimum_version os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -133,6 +135,7 @@ def test_pipeline(self, model_id, task): pipe(*inputs) + @unittest.skipIf(parse(_optimum_version) < Version("1.14.0"), "not supported, needs optimum>=v1.14.0") def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/tests/neural_compressor/test_onnx.py b/tests/neural_compressor/test_onnx.py index f5dc0b7c66..387c369dd1 100644 --- a/tests/neural_compressor/test_onnx.py +++ b/tests/neural_compressor/test_onnx.py @@ -54,7 +54,7 @@ def test_static_quantization(self, task, model_name, expected_quantized_matmuls) tokenizer.pad_token = tokenizer.eos_token quantizer = INCQuantizer.from_pretrained(model, task=task) calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples) - save_onnx_model = True + save_onnx_model = False op_type_dict = ( {"Embedding": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}} if save_onnx_model diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index bf1a007844..c29e8c2eef 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -51,6 +51,7 @@ from transformers.onnx.utils import get_preprocessor from utils_tests import MODEL_NAMES +from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, @@ -111,6 +112,19 @@ def test_load_from_hub_and_save_model(self): self.assertIsInstance(loaded_model.config, PretrainedConfig) loaded_model_outputs = loaded_model(**tokens) + # Test that model caching is automatically enabled + openvino_cache_dir = loaded_model.model_save_dir / "model_cache" + self.assertTrue(openvino_cache_dir.is_dir()) + self.assertGreaterEqual(len(list(openvino_cache_dir.glob("*.blob"))), 1) + + # Test specifying ov_config with throughput hint and manual cache dir + manual_openvino_cache_dir = loaded_model.model_save_dir / "manual_model_cache" + ov_config = {"CACHE_DIR": str(manual_openvino_cache_dir), "PERFORMANCE_HINT": "THROUGHPUT"} + loaded_model = OVModelForSequenceClassification.from_pretrained(self.OV_MODEL_ID, ov_config=ov_config) + self.assertTrue(manual_openvino_cache_dir.is_dir()) + self.assertGreaterEqual(len(list(manual_openvino_cache_dir.glob("*.blob"))), 1) + self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") + with tempfile.TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) @@ -120,6 +134,7 @@ def test_load_from_hub_and_save_model(self): outputs = model(**tokens) self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits)) + del loaded_model del model gc.collect() @@ -276,6 +291,10 @@ def test_pipeline(self, model_arch): self.assertTrue(not model.is_dynamic) self.assertGreaterEqual(outputs[0]["score"], 0.0) self.assertIsInstance(outputs[0]["label"], str) + # Test that model caching was not automatically enabled for exported model + openvino_cache_dir = model.model_save_dir / "model_cache" + self.assertFalse(openvino_cache_dir.is_dir()) + del model del pipe gc.collect() @@ -466,7 +485,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "pegasus", ) GENERATION_LENGTH = 100 - SPEEDUP_CACHE = 1.1 @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -479,7 +497,12 @@ def test_compare_to_transformers(self, model_arch): tokens = tokenizer( "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None ) - ov_outputs = ov_model(**tokens) + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + ov_outputs = ov_model(**tokens, position_ids=position_ids) + self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) with torch.no_grad(): @@ -539,29 +562,17 @@ def test_compare_with_and_without_past_key_values(self): tokens = tokenizer("This is a sample input", return_tensors="pt") model_with_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True) - # Warmup - _ = model_with_pkv.generate(**tokens) - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) model_without_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False) - - # Warmup - _ = model_without_pkv.generate(**tokens) - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + del model_with_pkv del model_without_pkv gc.collect() diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c1ec95ea9b..3154ae1133 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -63,7 +63,7 @@ class OVQuantizerTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 32, 35), - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 22), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 23), ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) @@ -145,7 +145,7 @@ class OVWeightCompressionTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_COMPRESSED_MATMULS = ( (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 70, 35), - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 45, 22), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-BartForCausalLM", 27, 14), ) SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = ( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 958e6e23b3..fc2344a7ac 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -24,7 +24,7 @@ "bart": "hf-internal-testing/tiny-random-bart", "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", - "blenderbot": "hf-internal-testing/tiny-random-blenderbot", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", "camembert": "hf-internal-testing/tiny-random-camembert", "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", @@ -103,7 +103,7 @@ "albert": (42,), "vit": (31,), "blenderbot": (35,), - "gpt2": (22,), + "gpt2": (23,), "wav2vec2": (15,), "distilbert": (33,), "t5": (32, 52, 42),