Enable OpenVINO export of loaded model
echarlaix committed Feb 9, 2024
1 parent 0ece48b commit a51d02a
Showing 3 changed files with 314 additions and 213 deletions.
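The file shown below moves the bulk of `main_export` into a new `export_from_model` entry point in `optimum/exporters/openvino/convert.py`, so an already-loaded model can be exported directly instead of being reloaded from a model ID. A minimal sketch of that new path, assuming only the `export_from_model` keyword arguments visible in the diff (the model, output directory, and task values here are illustrative):

from pathlib import Path

from transformers import AutoModelForCausalLM

from optimum.exporters.openvino.convert import export_from_model

# Load the model in memory first, then export the loaded instance to OpenVINO IR.
model = AutoModelForCausalLM.from_pretrained("gpt2")
export_from_model(
    model=model,
    output=Path("gpt2_openvino"),  # illustrative output directory
    task="text-generation-with-past",
)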
299 changes: 92 additions & 207 deletions optimum/exporters/openvino/__main__.py
@@ -13,27 +13,22 @@
 # limitations under the License.
 
 import logging
-import os
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Union
 
 from requests.exceptions import ConnectionError as RequestsConnectionError
-from transformers import AutoConfig, PreTrainedTokenizerBase
+from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
 
 from optimum.exporters import TasksManager
-from optimum.exporters.onnx import __main__ as optimum_main
-from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast
-from optimum.utils import DEFAULT_DUMMY_SHAPES
-from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
+from optimum.exporters.onnx.base import OnnxConfig
+from optimum.utils.save_utils import maybe_load_preprocessors
 
 from ...intel.utils.import_utils import (
-    is_nncf_available,
     is_openvino_tokenizers_available,
     is_optimum_version,
     is_transformers_version,
 )
-from .convert import export_models, export_tokenizer
-from .stateful import ensure_export_task_support_stateful
+from .convert import export_from_model, export_tokenizer
 
 
 if is_optimum_version(">=", "1.16.0"):
@@ -45,8 +40,6 @@
     "whisper",
 ]
 
-OV_XML_FILE_NAME = "openvino_model.xml"
-_MAX_UNCOMPRESSED_SIZE = 1e9
 
 logger = logging.getLogger(__name__)
 
@@ -143,70 +136,11 @@ def main_export(
     >>> main_export("gpt2", output="gpt2_onnx/")
     ```
     """
-    if (
-        compression_option is not None
-        and compression_option != "fp16"
-        and compression_option != "fp32"
-        and not is_nncf_available()
-    ):
-        raise ImportError(
-            f"Compression of the weights to {compression_option} requires nncf, please install it with `pip install nncf`"
-        )
-
-    model_kwargs = model_kwargs or {}
-
-    output = Path(output)
-    if not output.exists():
-        output.mkdir(parents=True)
-
     original_task = task
     task = TasksManager.map_from_synonym(task)
 
-    # Patch the modules to export of GPTQ models w/o GPU
-    do_gptq_patching = False
-    try:
-        config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
-        model_type = config.model_type.replace("_", "-")
-        config_dict = config.to_dict()
-        quantization_config = config_dict.get("quantization_config", None)
-        do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
-    except Exception:
-        model_type = None
-        pass
-
-    if do_gptq_patching:
-        import torch
-
-        torch.set_default_dtype(torch.float32)
-        orig_cuda_check = torch.cuda.is_available
-        torch.cuda.is_available = lambda: True
-
-        from optimum.gptq import GPTQQuantizer
-
-        orig_post_init_model = GPTQQuantizer.post_init_model
-
-        def post_init_model(self, model):
-            from auto_gptq import exllama_set_max_input_length
-
-            class StoreAttr(object):
-                pass
-
-            model.quantize_config = StoreAttr()
-            model.quantize_config.desc_act = self.desc_act
-            if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
-                model = exllama_set_max_input_length(model, self.max_input_length)
-            return model
-
-        GPTQQuantizer.post_init_model = post_init_model
-
     framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework)
-
-    # get the shapes to be used to generate dummy inputs
-    input_shapes = {}
-    for input_name in DEFAULT_DUMMY_SHAPES.keys():
-        input_shapes[input_name] = (
-            kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name]
-        )
+    library_name = TasksManager.infer_library_from_model(model_name_or_path, subfolder=subfolder)
 
     if task == "auto":
         try:
@@ -220,9 +154,44 @@ class StoreAttr(object):
             f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
         )
 
+    if convert_tokenizer and not is_openvino_tokenizers_available():
+        logger.warning(
+            "`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`"
+        )
+        convert_tokenizer = False
+
+    custom_architecture = False
     loading_kwargs = {}
-    if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
-        loading_kwargs["attn_implementation"] = "eager"
+    if library_name == "transformers":
+        config = AutoConfig.from_pretrained(
+            model_name_or_path,
+            subfolder=subfolder,
+            revision=revision,
+            cache_dir=cache_dir,
+            use_auth_token=use_auth_token,
+            local_files_only=local_files_only,
+            force_download=force_download,
+            trust_remote_code=trust_remote_code,
+        )
+        model_type = config.model_type.replace("_", "-")
+
+        if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+            custom_architecture = True
+        elif task not in TasksManager.get_supported_tasks_for_model_type(
+            model_type, exporter="onnx", library_name=library_name
+        ):
+            if original_task == "auto":
+                autodetected_message = " (auto-detected)"
+            else:
+                autodetected_message = ""
+            model_tasks = TasksManager.get_supported_tasks_for_model_type(
+                model_type, exporter="onnx", library_name=library_name
+            )
+            raise ValueError(
+                f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum OpenVINO exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}."
+            )
+        if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
+            loading_kwargs["attn_implementation"] = "eager"
 
     model = TasksManager.get_model_from_task(
         task,
@@ -239,37 +208,35 @@
         **loading_kwargs,
     )
 
-    custom_architecture = False
-    is_stable_diffusion = "stable-diffusion" in task
-    model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-")
-
-    if not is_stable_diffusion:
-        if model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE:
-            raise ValueError(
-                f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. "
-                f"If you want to support {model_type} please propose a PR or open up an issue."
-            )
-        if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task(
-            task, exporter="onnx"
-        ):
-            custom_architecture = True
+    needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None
 
-    if custom_architecture and custom_onnx_configs is None:
-        raise ValueError(
-            "Trying to export a model with a custom architecture, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models."
-        )
+    if needs_pad_token_id:
+        if pad_token_id is not None:
+            model.config.pad_token_id = pad_token_id
+        else:
+            tok = AutoTokenizer.from_pretrained(model_name_or_path)
+            pad_token_id = getattr(tok, "pad_token_id", None)
+            if pad_token_id is None:
+                raise ValueError(
+                    "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
+                )
+            model.config.pad_token_id = pad_token_id
 
-    if custom_architecture and original_task == "auto":
-        raise ValueError(
-            f'Automatic task detection is not supported with custom architectures. Please specify the `task` argument. Suggestion: task="{task}" (or task="{task}-with-past" if the model is decoder-based and supports KV cache)'
-        )
+    if "stable-diffusion" in task:
+        model_type = "stable-diffusion"
+    elif hasattr(model.config, "export_model_type"):
+        model_type = model.config.export_model_type.replace("_", "-")
+    else:
+        model_type = model.config.model_type.replace("_", "-")
 
     if (
         not custom_architecture
-        and not is_stable_diffusion
-        and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx")
+        and library_name != "diffusers"
+        and task + "-with-past"
+        in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="onnx", library_name=library_name)
     ):
-        if original_task == "auto":  # Make -with-past the default if --task was not explicitely specified
+        # Make -with-past the default if --task was not explicitely specified
+        if original_task == "auto":
             task = task + "-with-past"
     else:
         logger.info(
@@ -286,127 +253,45 @@ class StoreAttr(object):
             possible_synonyms = ""
         logger.info(f"Automatic task detection to {task}{possible_synonyms}.")
 
-    task_support_stateful = ensure_export_task_support_stateful(task)
-    stateful = stateful and task_support_stateful
-
     preprocessors = maybe_load_preprocessors(
         model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
    )
 
-    onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs(
+    export_from_model(
         model=model,
+        output=output,
         task=task,
-        monolith=False,
-        custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {},
-        custom_architecture=custom_architecture,
+        compression_option=compression_option,
+        compression_ratio=compression_ratio,
+        stateful=stateful,
+        model_kwargs=model_kwargs,
+        custom_onnx_configs=custom_onnx_configs,
         fn_get_submodels=fn_get_submodels,
         preprocessors=preprocessors,
-        _variant="default",
-        legacy=False,
+        device=device,
+        **kwargs_shapes,
     )
 
-    if compression_option is None:
-        num_parameters = model.num_parameters() if not is_stable_diffusion else model.unet.num_parameters()
-        if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
-            if is_nncf_available():
-                compression_option = "int8"
-                logger.info("The model weights will be quantized to int8.")
-            else:
-                logger.warning(
-                    "The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf."
-                    "please install it with `pip install nncf`"
-                )
-
-    if not is_stable_diffusion:
-        needs_pad_token_id = (
-            isinstance(onnx_config, OnnxConfigWithPast)
-            and getattr(model.config, "pad_token_id", None) is None
-            and task in ["text-classification"]
-        )
-
-        tokenizer = next(
-            (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)), None
-        )
+    if convert_tokenizer:
+        if library_name != "diffusers":
+            tokenizer = next(
+                (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)),
+                None,
+            )
 
-        if needs_pad_token_id:
-            if pad_token_id is not None:
-                model.config.pad_token_id = pad_token_id
-            elif tokenizer is not None:
+            if tokenizer is not None:
                 try:
-                    model.config.pad_token_id = tokenizer.pad_token_id
-                except Exception:
-                    raise ValueError(
-                        "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
+                    export_tokenizer(tokenizer, output)
+                except Exception as exception:
+                    logger.warning(
+                        "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "
+                        f"models won't be generated. Exception: {exception}"
                     )
-        # Saving the model config and preprocessor as this is needed sometimes.
-        model.config.save_pretrained(output)
-        generation_config = getattr(model, "generation_config", None)
-        if generation_config is not None:
-            generation_config.save_pretrained(output)
-        maybe_save_preprocessors(model_name_or_path, output)
-
-        if convert_tokenizer and tokenizer is not None and is_openvino_tokenizers_available():
-            try:
-                export_tokenizer(tokenizer, output)
-            except Exception as exception:
-                logger.warning(
-                    "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "
-                    f"models won't be generated. Exception: {exception}"
-                )
-
-        if model.config.is_encoder_decoder and task.startswith("text-generation"):
-            raise ValueError(
-                f"model.config.is_encoder_decoder is True and task is `{task}`, which are incompatible. If the task was auto-inferred, please fill a bug report"
-                f"at https://github.com/huggingface/optimum, if --task was explicitely passed, make sure you selected the right task for the model,"
-                f" referring to `optimum.exporters.tasks.TaskManager`'s `_TASKS_TO_AUTOMODELS`."
-            )
-
-        files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_onnx_configs.keys()]
-    else:
-        # save the subcomponent configuration
-        for model_name in models_and_onnx_configs:
-            subcomponent = models_and_onnx_configs[model_name][0]
-            if hasattr(subcomponent, "save_config"):
-                subcomponent.save_config(output / model_name)
-            elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"):
-                subcomponent.config.save_pretrained(output / model_name)
-
-        files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_onnx_configs]
-
-        # Saving the additional components needed to perform inference.
-        model.scheduler.save_pretrained(output.joinpath("scheduler"))
-
-        feature_extractor = getattr(model, "feature_extractor", None)
-        if feature_extractor is not None:
-            feature_extractor.save_pretrained(output.joinpath("feature_extractor"))
-
-        tokenizer = getattr(model, "tokenizer", None)
-        if tokenizer is not None:
-            tokenizer.save_pretrained(output.joinpath("tokenizer"))
-            if convert_tokenizer and is_openvino_tokenizers_available():
+        else:
+            tokenizer = getattr(model, "tokenizer", None)
+            if tokenizer is not None:
                 export_tokenizer(tokenizer, output)
 
-        tokenizer_2 = getattr(model, "tokenizer_2", None)
-        if tokenizer_2 is not None:
-            tokenizer_2.save_pretrained(output.joinpath("tokenizer_2"))
-            if convert_tokenizer and is_openvino_tokenizers_available():
-                export_tokenizer(tokenizer, output, suffix="_2")
-
-        model.save_config(output)
-
-    export_models(
-        models_and_onnx_configs=models_and_onnx_configs,
-        output_dir=output,
-        output_names=files_subpaths,
-        input_shapes=input_shapes,
-        device=device,
-        compression_option=compression_option,
-        compression_ratio=compression_ratio,
-        stateful=stateful,
-        model_kwargs=model_kwargs,
-    )
-
-    # Unpatch modules after GPTQ export
-    if do_gptq_patching:
-        torch.cuda.is_available = orig_cuda_check
-        GPTQQuantizer.post_init_model = orig_post_init_model
+            tokenizer_2 = getattr(model, "tokenizer_2", None)
+            if tokenizer_2 is not None:
+                export_tokenizer(tokenizer_2, output, suffix="_2")
(Diffs for the remaining two changed files were not loaded.)
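For the hub-download path, `main_export` still behaves as in its docstring example above: it infers the library and task, loads the model, delegates to `export_from_model`, and converts tokenizers when requested. A hedged end-to-end sketch, assuming the package re-exports `main_export` and that `convert_tokenizer` falls back with a warning when openvino-tokenizers is missing, as the diff indicates:

from optimum.exporters.openvino import main_export

# With the default task="auto", a "-with-past" variant is selected when the
# model type supports it; convert_tokenizer additionally emits OpenVINO
# tokenizer/detokenizer models.
main_export(
    "gpt2",
    output="gpt2_openvino",
    convert_tokenizer=True,
)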