diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py
index 94ea4f103b..180017b35c 100644
--- a/optimum/exporters/openvino/__init__.py
+++ b/optimum/exporters/openvino/__init__.py
@@ -12,11 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
+import pkgutil
+
+from . import _compat_sam2  # noqa: F401  # ensures the SAM2 patches are applied before registrations
 import optimum.exporters.openvino.model_configs
 
 from .__main__ import main_export
 from .convert import export, export_from_model, export_models, export_pytorch_via_onnx
 from .stateful import ensure_stateful_is_available, patch_stateful
 
+__path__ = pkgutil.extend_path(__path__, __name__)
 
 __all__ = ["main_export", "export", "export_models"]
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index 23f94c7cad..a4a0d34ea3 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -44,6 +44,7 @@
     clear_class_registry,
     deduce_diffusers_dtype,
     load_preprocessors,
+    resolve_model_type,
 )
 
 
@@ -274,7 +275,7 @@ def main_export(
     do_gptq_patching = quant_method == "gptq"
     do_bitnet_patching = quant_method == "bitnet"
 
-    model_type = config.model_type
+    model_type = resolve_model_type(config, task)
     if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
         custom_architecture = True
         if custom_export_configs is None:
@@ -446,10 +447,7 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs):
             )
             model.config.pad_token_id = pad_token_id
 
-        if hasattr(model.config, "export_model_type"):
-            model_type = model.config.export_model_type
-        else:
-            model_type = model.config.model_type
+        model_type = resolve_model_type(model.config, task)
 
         if (
             not custom_architecture
diff --git a/optimum/exporters/openvino/_compat_sam2.py b/optimum/exporters/openvino/_compat_sam2.py
new file mode 100644
index 0000000000..2535ee6bf0
--- /dev/null
+++ b/optimum/exporters/openvino/_compat_sam2.py
@@ -0,0 +1,122 @@
+"""SAM2 compatibility hooks for Optimum OpenVINO exporters."""
+
+from __future__ import annotations
+
+import transformers
+
+try:  # newer Transformers releases no longer expose MT5Tokenizer
+    from transformers import MT5Tokenizer  # type: ignore[attr-defined]
+except ImportError:  # fall back to T5Tokenizer on releases that dropped the alias
+    from transformers import T5Tokenizer
+
+    class MT5Tokenizer(T5Tokenizer):  # type: ignore[misc]
+        pass
+
+    setattr(transformers, "MT5Tokenizer", MT5Tokenizer)
+
+_SAM2_ERROR_TOKEN = "positional_embedding"
+
+
+def _patch_sam2_config():
+    try:
+        from transformers.models.sam2.configuration_sam2 import Sam2Config  # type: ignore
+    except Exception:
+        Sam2Config = None
+
+    try:
+        from transformers.models.sam2_video.configuration_sam2_video import Sam2VideoConfig  # type: ignore
+    except Exception:
+        Sam2VideoConfig = None
+
+    def _guard(cfg_cls):
+        if cfg_cls is None or getattr(cfg_cls, "_optimum_config_patched", False):
+            return
+        original_init = cfg_cls.__init__
+
+        def patched_init(self, *args, **kwargs):
+            original_init(self, *args, **kwargs)
+            try:
+                if getattr(self, "tie_word_embeddings", True):
+                    self.tie_word_embeddings = False
+            except Exception:
+                pass
+
+            try:
+                model_type = getattr(self, "model_type", None)
+                if model_type == "sam2_video":
+                    mapping = dict(getattr(self, "export_model_type_map", {}) or {})
+                    mapping.setdefault("feature-extraction", "sam2video_vision_encoder")
+                    mapping.setdefault("image-segmentation", "sam2video_mask_decoder")
+                    self.export_model_type_map = mapping
+                    if getattr(self, "export_model_type", None) is None:
+                        self.export_model_type = mapping.get("feature-extraction")
+            except Exception:
+                pass
+
+        cfg_cls.__init__ = patched_init
+        setattr(cfg_cls, "_optimum_config_patched", True)
+
+    _guard(Sam2Config)
+    _guard(Sam2VideoConfig)
+
+
+def _patch_sam2_mark_tied_weights():
+    try:
+        from transformers.models.sam2.modeling_sam2 import Sam2Model  # type: ignore
+    except Exception:  # transformers may not ship sam2 yet
+        Sam2Model = None
+
+    try:
+        from transformers.models.sam2_video.modeling_sam2_video import Sam2VideoModel  # type: ignore
+    except Exception:
+        Sam2VideoModel = None
+
+    def _guard(model_cls):
+        if model_cls is None:
+            return
+        original = getattr(model_cls, "mark_tied_weights_as_initialized", None)
+        if original is None or getattr(model_cls, "_optimum_mark_tied_weights_patched", False):
+            return
+
+        def patched(self, *args, **kwargs):
+            tied = getattr(self, "_tied_weights_keys", None)
+            if tied and not getattr(self, "_optimum_sam2_ties_filtered", False):
+                filtered = []
+                removed = False
+                for pair in tied:
+                    keys = pair if isinstance(pair, (list, tuple, set)) else (pair,)
+                    if any((_SAM2_ERROR_TOKEN in str(key)) for key in keys if key):
+                        removed = True
+                        continue
+                    filtered.append(pair)
+                if removed:
+                    try:
+                        self._tied_weights_keys = type(tied)(filtered)
+                    except Exception:
+                        self._tied_weights_keys = filtered
+                setattr(self, "_optimum_sam2_ties_filtered", True)
+                config = getattr(self, "config", None)
+                if config is not None and getattr(config, "tie_word_embeddings", None):
+                    try:
+                        config.tie_word_embeddings = False
+                    except Exception:
+                        pass
+            try:
+                return original(self, *args, **kwargs)
+            except AttributeError as err:
+                if _SAM2_ERROR_TOKEN in str(err):
+                    # Tied metadata can sporadically include buffers; skip them quietly.
+                    return
+                raise
+
+        model_cls.mark_tied_weights_as_initialized = patched
+        setattr(model_cls, "_optimum_mark_tied_weights_patched", True)
+
+    _guard(Sam2Model)
+    _guard(Sam2VideoModel)
+
+
+_patch_sam2_config()
+_patch_sam2_mark_tied_weights()
+
+__all__ = []
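Note: the _compat_sam2 module above works entirely through import-time side effects. A minimal sketch of what callers can expect once the package has been imported, assuming a transformers build that ships Sam2VideoConfig and that the config is constructible with defaults:

    # Illustrative check only, not part of this diff.
    from transformers.models.sam2_video.configuration_sam2_video import Sam2VideoConfig

    import optimum.exporters.openvino  # importing the package applies the patches

    config = Sam2VideoConfig()
    assert config.tie_word_embeddings is False
    assert config.export_model_type_map["feature-extraction"] == "sam2video_vision_encoder"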
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index be43181ea8..3499a672c8 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -69,6 +69,7 @@
     allow_skip_tracing_check,
     clear_class_registry,
     remove_none_from_dummy_inputs,
+    resolve_model_type,
     save_config,
     save_preprocessors,
     set_simplified_chat_template,
@@ -552,10 +553,7 @@ def export_from_model(
     if library_name != "open_clip":
         TasksManager.standardize_model_attributes(model)
 
-    if hasattr(model.config, "export_model_type") and model.config.export_model_type is not None:
-        model_type = model.config.export_model_type
-    else:
-        model_type = getattr(model.config, "model_type", None) or ""
+    model_type = resolve_model_type(model.config, task)
 
     custom_architecture = library_name == "transformers" and model_type not in TasksManager._SUPPORTED_MODEL_TYPE
 
@@ -698,11 +696,12 @@ def export_from_model(
     else:
         # save the subcomponent configuration
         for model_name in models_and_export_configs:
+            target_dir = output / model_name
             subcomponent = models_and_export_configs[model_name][0]
             if hasattr(subcomponent, "save_config"):
-                subcomponent.save_config(output / model_name)
+                subcomponent.save_config(target_dir)
             elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"):
-                subcomponent.config.save_pretrained(output / model_name)
+                subcomponent.config.save_pretrained(target_dir)
 
         files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_export_configs]
 
@@ -913,6 +912,61 @@ def _get_multi_modal_submodels_and_export_configs(
     return main_config, models_for_export, stateful_parts
 
 
+def _get_sam2_video_submodels_and_export_configs(
+    model: "PreTrainedModel",
+    task: str,
+    library_name: str,
+    int_dtype: str,
+    float_dtype: str,
+    preprocessors: Optional[List[Any]] = None,
+    exporter: str = "openvino",
+):
+    models_for_export: Dict[str, Tuple["PreTrainedModel", "OnnxConfig"]] = {}
+
+    def _component_export_name(name: str) -> str:
+        if name.startswith("sam2video_"):
+            return name[len("sam2video_"):]
+        if name.startswith("sam2_"):
+            return name[len("sam2_"):]
+        return name
+
+    normalized_task = task or ""
+    if normalized_task.startswith("feature-extraction"):
+        component_specs: List[Tuple[str, str]] = [
+            ("sam2video_vision_encoder", "feature-extraction"),
+            ("sam2video_prompt_encoder", "feature-extraction"),
+        ]
+    elif normalized_task.startswith("image-segmentation"):
+        component_specs = [("sam2video_mask_decoder", "image-segmentation")]
+    else:
+        component_specs = [
+            ("sam2video_vision_encoder", "feature-extraction"),
+            ("sam2video_prompt_encoder", "feature-extraction"),
+            ("sam2video_mask_decoder", "image-segmentation"),
+        ]
+
+    for component_model_type, component_task in component_specs:
+        config_constructor = TasksManager.get_exporter_config_constructor(
+            model=model,
+            exporter=exporter,
+            library_name=library_name,
+            task=component_task,
+            model_type=component_model_type,
+        )
+        export_config = config_constructor(
+            model.config,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+        export_name = _component_export_name(component_model_type)
+        models_for_export[export_name] = (model, export_config)
+
+    # The first component's config stands in as the "main" export config expected by callers.
+    export_config = next(iter(models_for_export.values()))[1] if models_for_export else None
+    stateful_parts = [False] * len(models_for_export)
+    return export_config, models_for_export, stateful_parts
+
+
 def _get_submodels_and_export_configs(
     model: Union["PreTrainedModel", "DiffusionPipeline"],
     task: str,
@@ -937,6 +991,20 @@ def _get_submodels_and_export_configs(
         return _get_multi_modal_submodels_and_export_configs(
             model, task, library_name, int_dtype, float_dtype, preprocessors, model_kwargs, stateful
         )
+    elif (
+        not custom_architecture
+        and library_name == "transformers"
+        and getattr(model.config, "model_type", None) == "sam2_video"
+    ):
+        return _get_sam2_video_submodels_and_export_configs(
+            model,
+            task,
+            library_name,
+            int_dtype,
+            float_dtype,
+            preprocessors,
+            exporter=exporter,
+        )
     elif not custom_architecture and library_name == "transformers" and model.config.model_type == "speecht5":
         return _get_speecht5_tss_model_for_export(
             model, task, library_name, int_dtype, float_dtype, preprocessors, model_kwargs
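For orientation, a sketch of what the new helper returns when no explicit task is given; this calls a private helper and assumes a loaded Sam2VideoModel instance named model:

    from optimum.exporters.openvino.convert import _get_sam2_video_submodels_and_export_configs

    main_cfg, parts, stateful = _get_sam2_video_submodels_and_export_configs(
        model, task="", library_name="transformers", int_dtype="int64", float_dtype="fp32"
    )
    print(sorted(parts))  # ['mask_decoder', 'prompt_encoder', 'vision_encoder']
    print(stateful)       # [False, False, False]

The keys come from _component_export_name, which strips the "sam2video_"/"sam2_" prefixes, so each component is exported under a short subfolder name.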
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 949bbc2fab..9eb86975c5 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -139,6 +139,9 @@
     Qwen2VLVisionEmbMergerPatcher,
     Qwen3MoeModelPatcher,
     QwenModelPatcher,
+    Sam2VideoMaskDecoderPatcher,
+    Sam2VideoPromptEncoderPatcher,
+    Sam2VideoVisionEncoderPatcher,
     SanaTextEncoderModelPatcher,
     XverseModelPatcher,
     Zamba2ModelPatcher,
@@ -203,6 +206,34 @@ def init_model_configs():
         "transformers",
         "AutoModelForImageTextToText",
     )
+    TasksManager._CUSTOM_CLASSES[("pt", "sam2video_vision_encoder", "feature-extraction")] = (
+        "transformers",
+        "Sam2VideoModel",
+    )
+    TasksManager._CUSTOM_CLASSES[("pt", "sam2video_prompt_encoder", "feature-extraction")] = (
+        "transformers",
+        "Sam2VideoModel",
+    )
+    TasksManager._CUSTOM_CLASSES[("pt", "sam2video_mask_decoder", "image-segmentation")] = (
+        "transformers",
+        "Sam2VideoModel",
+    )
+
+    if "transformers" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES:
+        TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["transformers"] = {}
+
+    sam2_video_model_tasks = {
+        "sam2video_vision_encoder": "feature-extraction",
+        "sam2video_prompt_encoder": "feature-extraction",
+        "sam2video_mask_decoder": "image-segmentation",
+    }
+
+    library_supported_models = TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["transformers"]
+    for model_type, task in sam2_video_model_tasks.items():
+        model_entry = library_supported_models.setdefault(model_type, {})
+        model_entry.setdefault("openvino", {})
+        TasksManager._SUPPORTED_MODEL_TYPE.setdefault(model_type, {})
+        TasksManager._SUPPORTED_MODEL_TYPE[model_type].setdefault("openvino", {})
 
     if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS:
         TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline"
@@ -4295,6 +4326,147 @@ class VisionEncoderDecoderOpenVINOConfig(VisionEncoderDecoderOnnxConfig):
     _MODEL_PATCHER = OVSeq2SeqModelPatcher
 
 
+@register_in_tasks_manager("sam2video_vision_encoder", "feature-extraction", library_name="transformers")
+class Sam2VideoVisionEncoderOpenVINOConfig(VisionOnnxConfig):
+    DEFAULT_ONNX_OPSET = 14
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    _MODEL_PATCHER = Sam2VideoVisionEncoderPatcher
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return {
+            "pixel_values": {
+                0: "video_batch_size",
+                1: "num_frames",
+                2: "num_channels",
+                3: "height",
+                4: "width",
+            }
+        }
+
+    def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
+        pixel_values = kwargs.pop("pixel_values", None)
+        num_frames = kwargs.pop("num_frames", getattr(self._config, "num_frames", 1)) or 1
+        dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
+        pixel_values = pixel_values if pixel_values is not None else dummy_inputs["pixel_values"]
+
+        # Normalize to a 4D (batch, channels, height, width) tensor first: if the dummy
+        # generator produced a 5D (batch, frames, C, H, W) tensor, keep only the first frame.
+        if pixel_values.ndim == 5:
+            if framework == "pt":
+                import torch  # type: ignore
+
+                pixel_values = pixel_values[:, 0].contiguous()
+            else:
+                import numpy as np  # type: ignore
+
+                pixel_values = np.ascontiguousarray(pixel_values[:, 0])
+
+        expected_frames = getattr(self._config, "num_frames", 1) or 1
+        if num_frames != expected_frames:
+            num_frames = expected_frames
+
+        # Rebuild the 5D video tensor with the configured number of frames.
+        if framework == "pt":
+            import torch
+
+            if pixel_values.dim() == 4:
+                pixel_values = pixel_values.unsqueeze(1)
+            if num_frames != pixel_values.shape[1]:
+                pixel_values = pixel_values.repeat(1, num_frames, 1, 1, 1)
+            pixel_values = pixel_values.contiguous()
+        else:
+            import numpy as np  # type: ignore
+
+            if pixel_values.ndim == 4:
+                pixel_values = np.expand_dims(pixel_values, axis=1)
+            if num_frames != pixel_values.shape[1]:
+                pixel_values = np.repeat(pixel_values, num_frames, axis=1)
+
+        dummy_inputs["pixel_values"] = pixel_values
+        return dummy_inputs
+
+
+@register_in_tasks_manager("sam2video_prompt_encoder", "feature-extraction", library_name="transformers")
+class Sam2VideoPromptEncoderOpenVINOConfig(TextDecoderOnnxConfig):
+    DEFAULT_ONNX_OPSET = 14
+    NORMALIZED_CONFIG_CLASS = NormalizedConfig
+    DUMMY_INPUT_GENERATOR_CLASSES: Tuple[DummyInputGenerator, ...] = ()
+    _MODEL_PATCHER = Sam2VideoPromptEncoderPatcher
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return {
+            "input_points": {0: "batch_size", 1: "point_batch_size", 2: "num_points", 3: "coords"},
+            "input_labels": {0: "batch_size", 1: "point_batch_size", 2: "num_points"},
+            "input_boxes": {0: "batch_size", 1: "point_batch_size", 2: "coords"},
+            "input_masks": {0: "batch_size", 1: "mask_channels", 2: "mask_height", 3: "mask_width"},
+        }
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        return {
+            "sparse_embeddings": {
+                0: "batch_size",
+                1: "point_batch_size",
+                2: "num_prompt_tokens",
+                3: "hidden_size",
+            },
+            "dense_embeddings": {0: "batch_size", 1: "hidden_size", 2: "mask_height", 3: "mask_width"},
+        }
+
+    def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
+        batch_size = max(1, kwargs.pop("batch_size", DEFAULT_DUMMY_SHAPES["batch_size"]))
+        point_batch_size = max(1, kwargs.pop("point_batch_size", 1))
+        num_points = max(1, kwargs.pop("num_points", 1))
+        num_boxes = max(1, kwargs.pop("num_boxes", point_batch_size))
+
+        prompt_config = getattr(self._config, "prompt_encoder_config", self._config)
+        image_size = getattr(prompt_config, "image_size", getattr(self._config, "image_size", 256))
+        patch_size = getattr(prompt_config, "patch_size", getattr(self._config, "patch_size", 16))
+        mask_height = kwargs.pop("mask_height", max(1, 4 * image_size // patch_size))
+        mask_width = kwargs.pop("mask_width", max(1, 4 * image_size // patch_size))
+
+        if framework == "pt":
+            import torch  # type: ignore
+
+            float_dtype = torch.float32
+            long_dtype = torch.long
+
+            input_points = torch.zeros((batch_size, point_batch_size, num_points, 2), dtype=float_dtype)
+            input_labels = torch.full((batch_size, point_batch_size, num_points), fill_value=-1, dtype=long_dtype)
+            input_boxes = torch.zeros((batch_size, num_boxes, 4), dtype=float_dtype)
+            input_masks = torch.zeros((batch_size, 1, mask_height, mask_width), dtype=float_dtype)
+        else:
+            import numpy as np  # type: ignore
+
+            float_dtype = np.float32
+            int_dtype = np.int64
+
+            input_points = np.zeros((batch_size, point_batch_size, num_points, 2), dtype=float_dtype)
+            input_labels = np.full((batch_size, point_batch_size, num_points), fill_value=-1, dtype=int_dtype)
+            input_boxes = np.zeros((batch_size, num_boxes, 4), dtype=float_dtype)
+            input_masks = np.zeros((batch_size, 1, mask_height, mask_width), dtype=float_dtype)
+
+        dummy_inputs = {
+            "input_points": input_points,
+            "input_labels": input_labels,
+            "input_boxes": input_boxes,
+            "input_masks": input_masks,
+        }
+
+        return dummy_inputs
+
+
+@register_in_tasks_manager("sam2video_mask_decoder", "image-segmentation", library_name="transformers")
+class Sam2VideoMaskDecoderOpenVINOConfig(VisionOnnxConfig):
+    DEFAULT_ONNX_OPSET = 14
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    _MODEL_PATCHER = Sam2VideoMaskDecoderPatcher
+
+
 class Zamba2DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator):
     """
     Generates dummy cache_params inputs for Zamba2 architectures.
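A quick sketch of the dummy shapes the prompt encoder config produces, assuming the OnnxConfig constructor accepts the model config as its first positional argument (model_config standing in for a real sam2_video config):

    cfg = Sam2VideoPromptEncoderOpenVINOConfig(model_config)
    dummy = cfg.generate_dummy_inputs(framework="pt")
    print(dummy["input_points"].shape)  # (batch, point_batch, num_points, 2)
    print(dummy["input_masks"].shape)   # (batch, 1, 4 * image_size // patch_size, 4 * image_size // patch_size)

Labels are filled with -1, the SAM convention for "no point supplied", so tracing does not accidentally bake in a real prompt.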
outputs["vision_attentions"] = vision_attentions_tuple + return outputs + + +def sam2_video_prompt_encoder_forward( + self, + input_points: Optional[torch.Tensor] = None, + input_labels: Optional[torch.Tensor] = None, + input_boxes: Optional[torch.Tensor] = None, + input_masks: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Expose prompt encoder outputs in a deterministic tuple.""" + + sparse_embeddings, dense_embeddings = self.get_prompt_embeddings( + input_points=input_points, + input_labels=input_labels, + input_boxes=input_boxes, + input_masks=input_masks, + ) + return sparse_embeddings, dense_embeddings + + +def sam2_video_mask_decoder_forward( + self, + image_embeddings: torch.Tensor, + image_positional_embeddings: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + multimask_output: bool = True, + high_resolution_features: Optional[List[torch.Tensor]] = None, + attention_similarity: Optional[torch.Tensor] = None, + target_embedding: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward that routes inputs directly to the mask decoder component.""" + + masks, iou_scores, sam_tokens, object_score_logits = self.mask_decoder( + image_embeddings=image_embeddings, + image_positional_embeddings=image_positional_embeddings, + sparse_prompt_embeddings=sparse_prompt_embeddings, + dense_prompt_embeddings=dense_prompt_embeddings, + multimask_output=multimask_output, + high_resolution_features=high_resolution_features, + attention_similarity=attention_similarity, + target_embedding=target_embedding, + ) + + return masks, iou_scores, sam_tokens, object_score_logits + + +class Sam2VideoVisionEncoderPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + model.forward = types.MethodType(sam2_video_vision_encoder_forward, model) + + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + +class Sam2VideoPromptEncoderPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + model.forward = types.MethodType(sam2_video_prompt_encoder_forward, model) + + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + +class Sam2VideoMaskDecoderPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + model.forward = types.MethodType(sam2_video_mask_decoder_forward, model) + + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + def _embednb_forward(self, ids: torch.Tensor) -> torch.Tensor: def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: assert dim % 2 == 0, "The dimension must be even." 
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 8616cc11c0..754f5ea189 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -49,6 +49,18 @@
 _MAX_UNCOMPRESSED_SIZE = 1e9
 
 
+def resolve_model_type(config: PretrainedConfig, task: Optional[str] = None) -> str:
+    # Prefer a task-specific mapping, then the generic override, then the raw model_type.
+    export_model_map = getattr(config, "export_model_type_map", None)
+    if task is not None and isinstance(export_model_map, dict):
+        export_model_type = export_model_map.get(task)
+        if export_model_type is not None:
+            return export_model_type
+    export_model_type = getattr(config, "export_model_type", None)
+    if export_model_type is not None:
+        return export_model_type
+    return getattr(config, "model_type", None) or ""
+
+
 def is_torch_model(model: Union["PreTrainedModel", "ModelMixin"]):
     """
     Checks whether the model is a torch model.
@@ -298,10 +310,7 @@ def save_preprocessors(
     preprocessors: List, config: PretrainedConfig, output: Union[str, Path], trust_remote_code: bool
 ):
     model_name_or_path = config._name_or_path
-    if hasattr(config, "export_model_type"):
-        model_type = config.export_model_type
-    else:
-        model_type = config.model_type
+    model_type = resolve_model_type(config)
     if preprocessors is not None:
         # phi3-vision processor does not have chat_template attribute that breaks Processor saving on disk
         if model_type == "phi3_v" and len(preprocessors) > 1:
diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py
index 141b055f96..deff82f520 100644
--- a/optimum/intel/__init__.py
+++ b/optimum/intel/__init__.py
@@ -16,6 +16,8 @@
 
 from transformers.utils import OptionalDependencyNotAvailable, _LazyModule
 
+from . import _compat_transformers  # noqa: F401  # ensures legacy architectures get registered
+
 from .utils import (
     is_diffusers_available,
     is_ipex_available,
diff --git a/optimum/intel/_compat_transformers.py b/optimum/intel/_compat_transformers.py
new file mode 100644
index 0000000000..5c332ae415
--- /dev/null
+++ b/optimum/intel/_compat_transformers.py
@@ -0,0 +1,64 @@
+"""Compatibility helpers for older Transformers releases."""
+
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def _register_auto_config(name: str, config_cls) -> None:
+    if config_cls is None:
+        return
+
+    try:
+        from transformers import AutoConfig
+
+        AutoConfig.register(name, config_cls)  # type: ignore[attr-defined]
+    except Exception as exc:  # pragma: no cover - Transformers API variations
+        try:
+            from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+
+            if hasattr(CONFIG_MAPPING, "register"):
+                CONFIG_MAPPING.register(name, config_cls)  # type: ignore[attr-defined]
+            else:
+                mapping = getattr(CONFIG_MAPPING, "_extra_content", None)
+                if isinstance(mapping, dict):
+                    mapping.setdefault(name, config_cls)
+        except Exception:
+            logger.debug("Failed to register %s with AutoConfig: %s", name, exc)
+        return
+    else:
+        logger.debug("Registered %s with AutoConfig", name)
+
+
+def ensure_sam2_video_registered() -> None:
+    try:
+        from transformers import AutoConfig
+
+        if hasattr(AutoConfig, "register"):
+            try:
+                # Probe whether the type is already known; for_model raises for unknown types.
+                AutoConfig.for_model("sam2_video")
+                return
+            except Exception:
+                pass
+    except Exception:
+        AutoConfig = None  # type: ignore
+
+    try:
+        from transformers.models.sam2_video.configuration_sam2_video import Sam2VideoConfig  # type: ignore
+    except Exception:
+        Sam2VideoConfig = None
+
+    if AutoConfig is not None:
+        try:
+            AutoConfig.register("sam2_video", Sam2VideoConfig)  # type: ignore[attr-defined]
+            return
+        except Exception:
+            pass
+
+    _register_auto_config("sam2_video", Sam2VideoConfig)
+
+
+ensure_sam2_video_registered()
+
+__all__ = ["ensure_sam2_video_registered"]
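The resolve_model_type helper added in optimum/exporters/openvino/utils.py above only reads attributes with getattr, so its three-step resolution order is easy to exercise with a toy config (not a real checkpoint):

    from types import SimpleNamespace
    from optimum.exporters.openvino.utils import resolve_model_type

    cfg = SimpleNamespace(
        model_type="sam2_video",
        export_model_type="sam2video_vision_encoder",
        export_model_type_map={"image-segmentation": "sam2video_mask_decoder"},
    )
    assert resolve_model_type(cfg, "image-segmentation") == "sam2video_mask_decoder"  # task map wins
    assert resolve_model_type(cfg, "feature-extraction") == "sam2video_vision_encoder"  # falls back to the override
    assert resolve_model_type(SimpleNamespace(model_type="bert"), None) == "bert"  # plain model_type last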
diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py
index dd63a40762..4442a915dd 100644
--- a/optimum/intel/openvino/__init__.py
+++ b/optimum/intel/openvino/__init__.py
@@ -32,6 +32,23 @@
     OV_XML_FILE_NAME,
 )
 
+try:
+    from transformers import utils as _transformers_utils
+except Exception:  # pragma: no cover - defensive fallback
+    _transformers_utils = None
+else:
+    try:
+        _ = _transformers_utils.TRANSFORMERS_CACHE
+    except AttributeError:
+        try:
+            from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE as _hf_cache_path
+        except Exception:
+            import os
+
+            _hf_cache_path = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "hub")
+
+        _transformers_utils.TRANSFORMERS_CACHE = _hf_cache_path
+
 warnings.simplefilter(action="ignore", category=FutureWarning)
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 65f1b770fd..d0feb80f2f 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -172,6 +172,7 @@ class OVQuantizationMethod(str, Enum):
     },
     "mistralai/Mixtral-8x7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
     "facebook/opt-2.7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.7},
+    "facebook/sam2.1-hiera-small": {"bits": 4, "sym": False, "group_size": 32, "ratio": 1.0},
     "togethercomputer/RedPajama-INCITE-Chat-3B-v1": {
         "bits": 4,
         "sym": False,
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index be89cf8ce2..6353a59db6 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -26,8 +26,16 @@
 from transformers import GenerationConfig, PretrainedConfig
 from transformers.file_utils import add_start_docstrings
 from transformers.generation import GenerationMixin
-from transformers.utils import is_offline_mode
 from transformers.utils.hub import cached_file
+
+try:  # present in transformers releases that still re-export it
+    from transformers.utils import is_offline_mode
+except ImportError:  # newer transformers dropped the re-export
+    try:
+        from huggingface_hub.utils import is_offline_mode  # type: ignore
+    except ImportError:
+
+        def is_offline_mode() -> bool:  # env-based fallback for very old hub versions
+            value = os.environ.get("TRANSFORMERS_OFFLINE", "0")
+            return value.strip().lower() in {"1", "true", "yes"}
 
 from optimum.exporters.base import ExportConfig
 from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index a7699a5f4f..6fc799815f 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -50,7 +50,14 @@
 from huggingface_hub.utils import validate_hf_hub_args
 from openvino import Core
 from openvino._offline_transformations import compress_model_transformation
-from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+try:
+    from transformers import CLIPFeatureExtractor, CLIPTokenizer
+except ImportError:  # CLIPFeatureExtractor was removed in favor of CLIPImageProcessor
+    from transformers import CLIPImageProcessor, CLIPTokenizer
+
+    CLIPFeatureExtractor = CLIPImageProcessor  # type: ignore
+
 from transformers.modeling_outputs import ModelOutput
 from transformers.utils import http_user_agent
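The same three-level is_offline_mode fallback now appears in both modeling_base.py above and modeling_open_clip.py below. A shared helper would keep the two copies in sync; a sketch of what that could look like in a hypothetical common module (not part of this diff):

    import os

    def _env_offline_mode() -> bool:
        return os.environ.get("TRANSFORMERS_OFFLINE", "0").strip().lower() in {"1", "true", "yes"}

    try:
        from transformers.utils import is_offline_mode
    except ImportError:
        try:
            from huggingface_hub.utils import is_offline_mode  # type: ignore
        except ImportError:
            is_offline_mode = _env_offline_mode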
diff --git a/optimum/intel/openvino/modeling_open_clip.py b/optimum/intel/openvino/modeling_open_clip.py
index a3536fd8da..dbf2e06fb9 100644
--- a/optimum/intel/openvino/modeling_open_clip.py
+++ b/optimum/intel/openvino/modeling_open_clip.py
@@ -31,7 +31,15 @@
 from transformers.file_utils import add_start_docstrings
 from transformers.modeling_outputs import ModelOutput
 from transformers.models.clip.modeling_clip import CLIPOutput
-from transformers.utils import is_offline_mode
+
+try:  # present in transformers releases that still re-export it
+    from transformers.utils import is_offline_mode
+except ImportError:  # newer transformers dropped the re-export
+    try:
+        from huggingface_hub.utils import is_offline_mode  # type: ignore
+    except ImportError:
+
+        def is_offline_mode() -> bool:  # env-based fallback for very old hub versions
+            value = os.environ.get("TRANSFORMERS_OFFLINE", "0")
+            return value.strip().lower() in {"1", "true", "yes"}
 
 from optimum.exporters.tasks import TasksManager
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index 46b98f6ab1..ac62ce1b79 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 
+import enum
 import json
 import logging
 import os
@@ -32,7 +33,21 @@
 from openvino import Type as OVType
 from packaging.version import Version
 from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size
+
+try:
+    from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size
+except ImportError:
+    try:
+        from transformers.utils.onnx import ParameterFormat, compute_serialized_parameters_size
+    except ImportError:
+
+        class ParameterFormat(enum.Enum):  # type: ignore
+            Float = "float32"
+
+        def compute_serialized_parameters_size(num_parameters: int, parameter_format: ParameterFormat) -> int:
+            # Minimal stand-in: float32 parameters take four bytes each.
+            if parameter_format == ParameterFormat.Float:
+                return num_parameters * 4
+            raise ValueError("Unsupported parameter format in fallback implementation.")
 
 from optimum.intel.utils.import_utils import is_torch_version
diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py
index cab9e5efa3..fcf9dabb82 100644
--- a/optimum/intel/utils/modeling_utils.py
+++ b/optimum/intel/utils/modeling_utils.py
@@ -23,7 +23,55 @@
 from typing import Dict, List, Optional, Type, Union
 
 import torch
-from huggingface_hub import HfApi, HfFolder, hf_hub_download
+
+try:
+    from huggingface_hub import HfApi, HfFolder, hf_hub_download
+except ImportError:
+    # huggingface_hub >= 0.26 stops re-exporting HfFolder from the package root
+    from huggingface_hub import HfApi, hf_hub_download
+
+    _get_token_fn = None
+    try:
+        from huggingface_hub import get_token as _get_token_fn  # type: ignore
+    except ImportError:
+        try:
+            from huggingface_hub._auth import get_token as _get_token_fn  # type: ignore
+        except ImportError:
+            _get_token_fn = None
+
+    if _get_token_fn is None:
+        try:
+            from huggingface_hub.constants import HF_TOKEN_PATH  # type: ignore
+        except ImportError:
+            HF_TOKEN_PATH = None  # type: ignore
+
+        def _read_token_from_fs() -> Optional[str]:
+            if HF_TOKEN_PATH is None:
+                return None
+            try:
+                token_path = Path(HF_TOKEN_PATH)
+                if token_path.exists():
+                    return token_path.read_text(encoding="utf-8").strip() or None
+            except Exception:
+                return None
+            return None
+
+        def _get_token_wrapper() -> Optional[str]:
+            return _read_token_from_fs()
+
+    else:
+
+        def _get_token_wrapper() -> Optional[str]:
+            try:
+                return _get_token_fn()
+            except Exception:
+                return None
+
+    class HfFolder:  # type: ignore
+        """Minimal token reader compatible with the legacy HfFolder API."""
+
+        @staticmethod
+        def get_token() -> Optional[str]:
+            # Static so legacy call sites using HfFolder.get_token() keep working.
+            return _get_token_wrapper()
+
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from huggingface_hub.hf_api import file_exists
 from transformers import CLIPConfig, PretrainedConfig, PreTrainedModel
diff --git a/setup.py b/setup.py
index e54d8104ee..1e1d015336 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
 INSTALL_REQUIRE = [
     "torch>=2.1",
     "optimum-onnx==0.0.*",
-    "transformers>=4.45,<4.56",
+    "transformers>=4.56.2,<4.57",
     "setuptools",
 ]
 
@@ -66,8 +66,8 @@
 EXTRAS_REQUIRE = {
     "nncf": ["nncf>=2.19.0"],
     "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"],
-    "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"],
-    "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"],
+    "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "datasets"],
+    "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>=4.56.2,<4.57", "accelerate"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
diff --git a/tests/openvino/test_sam2.py b/tests/openvino/test_sam2.py
new file mode 100644
index 0000000000..98fd3d9da4
--- /dev/null
+++ b/tests/openvino/test_sam2.py
@@ -0,0 +1,71 @@
+import unittest
+from functools import partial
+from types import SimpleNamespace
+
+from optimum.exporters.openvino.model_configs import (
+    Sam2VideoMaskDecoderOpenVINOConfig,
+    Sam2VideoPromptEncoderOpenVINOConfig,
+    Sam2VideoVisionEncoderOpenVINOConfig,
+)
+from optimum.exporters.tasks import TasksManager  # type: ignore[attr-defined]
+from optimum.intel.openvino.configuration import _DEFAULT_4BIT_WQ_CONFIGS
+
+
+class _DummySam2VideoModel:
+    def __init__(self):
+        self.config = SimpleNamespace(model_type="sam2_video")
+
+
+class Sam2VideoRegistrationTest(unittest.TestCase):
+    def setUp(self):
+        self.model = _DummySam2VideoModel()
+
+    def test_vision_encoder_config_registered(self):
+        ctor = TasksManager.get_exporter_config_constructor(
+            model=self.model,
+            exporter="openvino",
+            library_name="transformers",
+            task="feature-extraction",
+            model_type="sam2video_vision_encoder",
+        )
+        if isinstance(ctor, partial):
+            self.assertIs(ctor.func, Sam2VideoVisionEncoderOpenVINOConfig)
+        else:
+            self.assertIs(ctor, Sam2VideoVisionEncoderOpenVINOConfig)
+
+    def test_prompt_encoder_config_registered(self):
+        ctor = TasksManager.get_exporter_config_constructor(
+            model=self.model,
+            exporter="openvino",
+            library_name="transformers",
+            task="feature-extraction",
+            model_type="sam2video_prompt_encoder",
+        )
+        if isinstance(ctor, partial):
+            self.assertIs(ctor.func, Sam2VideoPromptEncoderOpenVINOConfig)
+        else:
+            self.assertIs(ctor, Sam2VideoPromptEncoderOpenVINOConfig)
+
+    def test_mask_decoder_config_registered(self):
+        ctor = TasksManager.get_exporter_config_constructor(
+            model=self.model,
+            exporter="openvino",
+            library_name="transformers",
+            task="image-segmentation",
+            model_type="sam2video_mask_decoder",
+        )
+        if isinstance(ctor, partial):
+            self.assertIs(ctor.func, Sam2VideoMaskDecoderOpenVINOConfig)
+        else:
+            self.assertIs(ctor, Sam2VideoMaskDecoderOpenVINOConfig)
+
+
+class Sam2QuantDefaultsTest(unittest.TestCase):
+    def test_default_4bit_quant_config_registered(self):
+        self.assertIn("facebook/sam2.1-hiera-small", _DEFAULT_4BIT_WQ_CONFIGS)
+        config = _DEFAULT_4BIT_WQ_CONFIGS["facebook/sam2.1-hiera-small"]
+        self.assertEqual(config.get("bits"), 4)
+        self.assertEqual(config.get("group_size"), 32)
+        self.assertFalse(config.get("sym"))
+        self.assertEqual(config.get("ratio"), 1.0)
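End-to-end, the pieces above are wired into the regular export entry point. An untested sketch, assuming the facebook/sam2.1-hiera-small checkpoint referenced in the quantization defaults resolves to one of the patched SAM2 architectures:

    from optimum.exporters.openvino import main_export

    main_export(
        model_name_or_path="facebook/sam2.1-hiera-small",
        output="sam2_ov",
        task="feature-extraction",  # vision + prompt encoders; "image-segmentation" selects the mask decoder
    )

With the sam2_video dispatch in convert.py, this writes one OpenVINO model per component under the output directory.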