
Commit 2e48edc

Merge remote-tracking branch 'upstream/main' into penghuic/fixed_UT_error

2 parents: 0d51d64 + 478ad69

22 files changed: +807 −122 lines

.github/workflows/delete_doc_comment.yml

Lines changed: 0 additions & 18 deletions
This file was deleted.

.github/workflows/delete_doc_comment_trigger.yml

Lines changed: 0 additions & 12 deletions
This file was deleted.

README.md

Lines changed: 2 additions & 1 deletion
@@ -75,12 +75,13 @@ It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2
 optimum-cli export openvino --model gpt2 ov_model
 ```
 
-If you add `--int8`, the weights will be quantized to INT8, the activations will be kept in floating point precision.
+If you add `--int8`, the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
 
 ```plain
 optimum-cli export openvino --model gpt2 --int8 ov_model
 ```
 
+To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
 
 #### Inference:

docs/source/inference.mdx

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ You can also apply INT8 quantization on your models weights when exporting your
 optimum-cli export openvino --model gpt2 --int8 ov_model
 ```
 
-This will results in the exported model linear and embedding layers to be quanrtized to INT8, the activations will be kept in floating point precision.
+This will results in the exported model linear and embedding layers to be quantized to INT8, the activations will be kept in floating point precision.
 
 This can also be done when loading your model by setting the `load_in_8bit` argument when calling the `from_pretrained()` method.
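
As the added documentation line notes, the same quantization can be applied at load time. A minimal sketch, assuming a gpt2 checkpoint (`export=True` converts the model to OpenVINO on the fly):

```python
from optimum.intel import OVModelForCausalLM

# load_in_8bit quantizes the linear and embedding weights to INT8
# at load time, mirroring the CLI's --int8 flag.
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)
```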

optimum/exporters/openvino/__main__.py

Lines changed: 17 additions & 1 deletion
@@ -26,10 +26,19 @@
 from optimum.utils import DEFAULT_DUMMY_SHAPES
 from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
 
-from ...intel.utils.import_utils import is_nncf_available
+from ...intel.utils.import_utils import is_nncf_available, is_optimum_version, is_transformers_version
 from .convert import export_models
 
 
+if is_optimum_version(">=", "1.16.0"):
+    from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
+else:
+    # Copied from https://github.com/huggingface/optimum/blob/main/optimum/exporters/onnx/constants.py
+    SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED = [
+        "bart",
+        "whisper",
+    ]
+
 OV_XML_FILE_NAME = "openvino_model.xml"
 
 _MAX_UNCOMPRESSED_SIZE = 1e9
@@ -140,10 +149,12 @@ def main_export(
     do_gptq_patching = False
     try:
         config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
+        model_type = config.model_type.replace("_", "-")
         config_dict = config.to_dict()
         quantization_config = config_dict.get("quantization_config", None)
         do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
     except Exception:
+        model_type = None
         pass
 
     if do_gptq_patching:
@@ -192,6 +203,10 @@ class StoreAttr(object):
             f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
         )
 
+    loading_kwargs = {}
+    if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
+        loading_kwargs["attn_implementation"] = "eager"
+
     model = TasksManager.get_model_from_task(
         task,
         model_name_or_path,
@@ -204,6 +219,7 @@ class StoreAttr(object):
         trust_remote_code=trust_remote_code,
         framework=framework,
         device=device,
+        **loading_kwargs,
     )
 
     custom_architecture = False
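
In other words: when transformers >= 4.36 would default to SDPA attention for an architecture that cannot be traced during export, the model is loaded with eager attention instead. A minimal sketch of the pattern, assuming transformers >= 4.36 and whisper-tiny as an example checkpoint:

```python
from transformers import AutoConfig, AutoModelForSpeechSeq2Seq

checkpoint = "openai/whisper-tiny"  # assumed example; "whisper" is in the unsupported list
config = AutoConfig.from_pretrained(checkpoint)
model_type = config.model_type.replace("_", "-")

loading_kwargs = {}
if model_type in ["bart", "whisper"]:  # SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
    # transformers >= 4.36 defaults to SDPA attention, which these
    # architectures cannot trace for export; force the eager path.
    loading_kwargs["attn_implementation"] = "eager"

model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint, **loading_kwargs)
```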

optimum/exporters/openvino/convert.py

Lines changed: 5 additions & 3 deletions
@@ -31,7 +31,7 @@
 from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
 from optimum.utils import is_diffusers_available
 
-from ...intel.utils.import_utils import is_nncf_available
+from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
 from .utils import (
     OV_XML_FILE_NAME,
     clear_class_registry,
@@ -307,8 +307,10 @@ def export_pytorch(
         # model.config.torchscript = True can not be used for patching, because it overrides return_dict to Flase
         if custom_patcher or dict_inputs:
             patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs)
-            # DecoderModelPatcher does not override model forward
-            if isinstance(patcher, DecoderModelPatcher) or patcher.orig_forward_name != "forward":
+            # DecoderModelPatcher does not override model forward in optimum < 1.15
+            if (
+                isinstance(patcher, DecoderModelPatcher) and is_optimum_version("<", "1.15.0")
+            ) or patcher.orig_forward_name != "forward":
                 patch_model_forward = True
                 patched_forward = model.forward
         else:
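
Put differently, newer optimum releases make DecoderModelPatcher override `forward` itself, so the manual forward patch is only needed on older versions. A one-line sketch of the gate, using `is_optimum_version` from the same utils module:

```python
from optimum.intel.utils.import_utils import is_optimum_version

# Manual forward patching is only required when DecoderModelPatcher
# does not override forward itself, i.e. on optimum < 1.15.
needs_manual_forward_patch = is_optimum_version("<", "1.15.0")
```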

optimum/intel/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -99,6 +99,7 @@
         "OVModelForPix2Struct",
         "OVModelForQuestionAnswering",
         "OVModelForSeq2SeqLM",
+        "OVModelForSpeechSeq2Seq",
         "OVModelForSequenceClassification",
         "OVModelForTokenClassification",
     ]
@@ -195,6 +196,7 @@
         OVModelForQuestionAnswering,
         OVModelForSeq2SeqLM,
         OVModelForSequenceClassification,
+        OVModelForSpeechSeq2Seq,
         OVModelForTokenClassification,
     )

optimum/intel/generation/modeling.py

Lines changed: 13 additions & 1 deletion
@@ -44,12 +44,24 @@
 logger = logging.getLogger(__name__)
 
 
+def get_float_type(model_dtype: torch.dtype):
+    if model_dtype == torch.bfloat16:
+        return "bf16"
+    elif model_dtype == torch.float16:
+        return "fp16"
+    else:
+        return "fp32"
+
+
 def prepare_jit_inputs(model: PreTrainedModel, task: str, use_cache: bool = False):
     task = _TASK_ALIASES.get(task, task)
     signature = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.__call__)
     onnx_config_class = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task)
+    float_dtype = get_float_type(model.dtype)
     if "text-generation" in task:
-        onnx_config = onnx_config_class(model.config, use_past=use_cache, use_past_in_inputs=use_cache)
+        onnx_config = onnx_config_class(
+            model.config, use_past=use_cache, use_past_in_inputs=use_cache, float_dtype=float_dtype
+        )
     else:
         onnx_config = onnx_config_class(model.config)
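
A quick check of the new helper's mapping, which keeps the ONNX config's dummy inputs consistent with a model loaded in half precision (the import path assumes the module above):

```python
import torch
from optimum.intel.generation.modeling import get_float_type

assert get_float_type(torch.bfloat16) == "bf16"
assert get_float_type(torch.float16) == "fp16"
assert get_float_type(torch.float32) == "fp32"  # any other dtype falls back to fp32
```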

optimum/intel/neural_compressor/modeling_base.py

Lines changed: 1 addition & 1 deletion
@@ -164,7 +164,7 @@ def _from_pretrained(
         if q_config is None:
             model = model_class.from_pretrained(model_save_dir)
         else:
-            init_contexts = [no_init_weights(_enable=True)]
+            init_contexts = [no_init_weights(_enable=False)]
             with ContextManagers(init_contexts):
                 model = model_class(config)
             try:

optimum/intel/neural_compressor/trainer.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
from optimum.exporters import TasksManager
7171

7272
from ..utils.constant import _TASK_ALIASES, MIN_QDQ_ONNX_OPSET, ONNX_WEIGHTS_NAME, TRAINING_ARGS_NAME
73-
from ..utils.import_utils import is_neural_compressor_version
73+
from ..utils.import_utils import is_neural_compressor_version, is_transformers_version
7474
from .configuration import INCConfig
7575

7676

@@ -207,6 +207,9 @@ def _inner_training_loop(
207207
):
208208
self.accelerator.free_memory()
209209
self._train_batch_size = batch_size
210+
211+
if self.args.auto_find_batch_size:
212+
self.state.train_batch_size = self._train_batch_size
210213
logger.debug(f"Currently training with a batch size of: {self._train_batch_size}")
211214
# Data loader and number of training steps
212215
train_dataloader = self.get_train_dataloader()
@@ -260,7 +263,10 @@ def _inner_training_loop(
260263
else:
261264
debug_overflow = DebugUnderflowOverflow(self.model) # noqa
262265

263-
delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled
266+
is_fsdp_xla_enabled = (
267+
self.is_fsdp_xla_enabled if is_transformers_version(">=", "4.36.0") else self.fsdp is not None
268+
)
269+
delay_optimizer_creation = is_sagemaker_mp_enabled() or is_fsdp_xla_enabled or self.is_fsdp_enabled
264270

265271
if self.is_deepspeed_enabled:
266272
self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)
@@ -270,6 +276,7 @@ def _inner_training_loop(
270276

271277
self.state = TrainerState()
272278
self.state.is_hyper_param_search = trial is not None
279+
self.state.train_batch_size = self._train_batch_size
273280

274281
# Compute absolute values for logging, eval, and save if given as ratio
275282
if args.logging_steps is not None:
@@ -305,7 +312,7 @@ def _inner_training_loop(
305312
use_accelerator_prepare = True if model is self.model else False
306313

307314
if delay_optimizer_creation:
308-
if use_accelerator_prepare:
315+
if is_transformers_version("<", "4.36.0") and use_accelerator_prepare:
309316
self.model = self.accelerator.prepare(self.model)
310317
self.create_optimizer_and_scheduler(num_training_steps=max_steps)
311318

@@ -473,6 +480,18 @@ def _inner_training_loop(
473480
step = -1
474481
for step, inputs in enumerate(epoch_iterator):
475482
total_batched_samples += 1
483+
484+
if is_transformers_version(">=", "4.36.0") and self.args.include_num_input_tokens_seen:
485+
main_input_name = getattr(self.model, "main_input_name", "input_ids")
486+
if main_input_name not in inputs:
487+
logger.warning(
488+
"Tried to track the number of tokens seen, however the current model is "
489+
"not configured properly to know what item is the input. To fix this, add "
490+
"a `main_input_name` attribute to the model class you are using."
491+
)
492+
else:
493+
self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel()
494+
476495
if rng_to_sync:
477496
self._load_rng_state(resume_from_checkpoint)
478497
rng_to_sync = False
@@ -521,9 +540,7 @@ def _inner_training_loop(
521540
):
522541
# the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered
523542
# in accelerate. So, explicitly enable sync gradients to True in that case.
524-
if is_last_step_and_steps_less_than_grad_acc or (
525-
version.parse(accelerate_version) <= version.parse("0.20.3")
526-
):
543+
if is_last_step_and_steps_less_than_grad_acc:
527544
self.accelerator.gradient_state._set_sync_gradients(True)
528545

529546
# Gradient clipping
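
The hunks above all gate on the installed transformers release instead of pinning one. A minimal sketch of the pattern, assuming `is_transformers_version` compares an operator and version string against the installed package:

```python
from optimum.intel.utils.import_utils import is_transformers_version

# transformers 4.36 replaced the trainer's `fsdp` attribute check with
# `is_fsdp_xla_enabled` and added token-count tracking, so the INC
# trainer branches at runtime rather than requiring one pinned version.
if is_transformers_version(">=", "4.36.0"):
    fsdp_attr = "is_fsdp_xla_enabled"  # 4.36+ attribute
else:
    fsdp_attr = "fsdp"  # legacy attribute (checked against None)
print(f"delay-optimizer check uses Trainer.{fsdp_attr}")
```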

optimum/intel/openvino/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@
     OVModelForTokenClassification,
 )
 from .modeling_decoder import OVModelForCausalLM
-from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM
+from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM, OVModelForSpeechSeq2Seq
 
 
 if is_diffusers_available():
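
With the class now exported, it can be used like the other OV model classes. A hedged usage sketch, assuming a Whisper checkpoint:

```python
from transformers import AutoProcessor
from optimum.intel import OVModelForSpeechSeq2Seq

checkpoint = "openai/whisper-tiny"  # assumed example checkpoint
processor = AutoProcessor.from_pretrained(checkpoint)
# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly.
model = OVModelForSpeechSeq2Seq.from_pretrained(checkpoint, export=True)
```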

optimum/intel/openvino/modeling_base_seq2seq.py

Lines changed: 0 additions & 2 deletions
@@ -68,8 +68,6 @@ def __init__(
         self.ov_config = ov_config if ov_config is not None else {}
         self.preprocessors = kwargs.get("preprocessors", [])
 
-        if "GPU" in self._device:
-            raise ValueError("Support of dynamic shapes for GPU devices is not yet available.")
         if self.is_dynamic:
             encoder = self._reshape(encoder, -1, -1, is_decoder=False)
             decoder = self._reshape(decoder, -1, -1)
