diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index df2885fd09..d7e27cd94c 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -71,6 +71,7 @@
     _get_open_clip_submodels_fn_and_export_configs,
     clear_class_registry,
     remove_none_from_dummy_inputs,
+    save_config,
 )


@@ -659,7 +660,11 @@ def export_from_model(
         files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
     elif library_name != "diffusers":
         if is_transformers_version(">=", "4.44.99"):
-            misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
+            # some model configs may have issues with loading without parameters initialization
+            try:
+                misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
+            except KeyError:
+                misplaced_generation_parameters = {}
             if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
                 logger.warning(
                     "Moving the following attributes in the config to the generation config: "
@@ -671,7 +676,7 @@
                     setattr(model.config, param_name, None)

         # Saving the model config and preprocessor as this is needed sometimes.
-        model.config.save_pretrained(output)
+        save_config(model.config, output)
         generation_config = getattr(model, "generation_config", None)
         if generation_config is not None:
             try:
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 5276ade33b..63ee5c5950 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -1444,7 +1444,7 @@ class InternVLChatConfigBehavior(str, enum.Enum):
 @register_in_tasks_manager("internvl-chat", *["image-text-to-text"], library_name="transformers")
 class InternVLChatOpenVINOConfig(OnnxConfig):
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in InternVLChatConfigBehavior]
-    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)

     def __init__(
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 9286a37f78..7013342099 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -13,7 +13,9 @@
 # limitations under the License.

 import inspect
+import logging
 from collections import namedtuple
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 from transformers.utils import is_torch_available
@@ -25,6 +27,9 @@
 from optimum.utils import is_diffusers_available


+logger = logging.getLogger(__name__)
+
+
 InputInfo = namedtuple("InputInfo", ["name", "shape", "type", "example"])


@@ -209,3 +214,16 @@ def get_submodels(model):


 MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv"]
+
+
+def save_config(config, save_dir):
+    try:
+        config.save_pretrained(save_dir)
+    except Exception as exp:
+        logger.warning(
+            f"Attempt to save config using the standard API has failed with {exp}. There may be an issue with the model config, please check its correctness before usage."
+        )
+        save_dir = Path(save_dir)
+        save_dir.mkdir(exist_ok=True, parents=True)
+        output_config_file = Path(save_dir / "config.json")
+        config.to_json_file(output_config_file, use_diff=True)
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index ed3cdadb51..320d77c4ca 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -136,7 +136,11 @@ def __init__(
         self.generation_config = generation_config or GenerationConfig.from_model_config(config)

         if is_transformers_version(">=", "4.44.99"):
-            misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            # some model configs may have issues with loading without parameters initialization
+            try:
+                misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            except KeyError:
+                misplaced_generation_parameters = {}
             if len(misplaced_generation_parameters) > 0:
                 logger.warning(
                     "Moving the following attributes in the config to the generation config: "
diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py
index 06c6011488..0ce15641fe 100644
--- a/optimum/intel/openvino/modeling_base_seq2seq.py
+++ b/optimum/intel/openvino/modeling_base_seq2seq.py
@@ -84,7 +84,11 @@ def __init__(
         self.generation_config = generation_config or GenerationConfig.from_model_config(config)

         if is_transformers_version(">=", "4.44.99"):
-            misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            # some model configs may have issues with loading without parameters initialization
+            try:
+                misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            except KeyError:
+                misplaced_generation_parameters = {}
             if len(misplaced_generation_parameters) > 0:
                 logger.warning(
                     "Moving the following attributes in the config to the generation config: "
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 8f72a73535..b7bf96a0de 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -15,6 +15,7 @@
 from PIL.Image import Image
 from transformers import (
     AutoConfig,
+    AutoImageProcessor,
     GenerationConfig,
     GenerationMixin,
     PretrainedConfig,
@@ -24,6 +25,7 @@

 from ...exporters.openvino import main_export
 from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
+from ...exporters.openvino.utils import save_config
 from .. import OVQuantizer
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
@@ -319,6 +321,13 @@ def compile(self):
             if part_model is not None:
                 part_model._compile()

+    def _save_config(self, save_directory):
+        """
+        Saves a model configuration into a directory, so that it can be re-loaded using the
+        [`from_pretrained`] class method.
+ """ + save_config(self.config, save_directory) + def _save_pretrained(self, save_directory: Union[str, Path]): """ Saves the model to the OpenVINO IR format so that it can be re-loaded using the @@ -728,9 +737,9 @@ def can_generate(self): @staticmethod @abstractmethod def preprocess_inputs( - processor, text: str, image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): """ @@ -902,15 +911,23 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values): @staticmethod def preprocess_inputs( - processor, text: str, image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): - if image is None: - raise ValueError("Image is required.") - chat_template = [{"role": "user", "content": [{"type": "text", "text": text}, {"type": "image"}]}] - prompt = processor.apply_chat_template(chat_template, add_generation_prompt=True) + if processor is None: + raise ValueError("Processor is required.") + if getattr(processor, "chat_template", None) is not None: + chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}] + if image is not None: + chat_prompt[0]["content"].append({"type": "image"}) + prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False) + else: + if image is not None and "" not in text: + prompt = "\n" + text + else: + prompt = text inputs = processor(images=image, text=prompt, return_tensors="pt") return inputs @@ -1209,6 +1226,159 @@ def merge_vision_text_embeddings( input_embeds = input_embeds.reshape(B, N, C) return input_embeds, attention_mask, position_ids + def preprocess_inputs( + self, + text: str, + image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + ): + if tokenizer is None: + raise ValueError("Tokenizer is required.") + import torchvision.transforms as T + from torchvision.transforms.functional import InterpolationMode + + IMG_START_TOKEN = "" + IMG_END_TOKEN = "" + IMG_CONTEXT_TOKEN = "" + + IMAGENET_MEAN = (0.485, 0.456, 0.406) + IMAGENET_STD = (0.229, 0.224, 0.225) + + def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose( + [ + T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD), + ] + ) + return transform + + def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float("inf") + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + def dynamic_preprocess(image, min_num=1, max_num=12, image_size=28, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = { + (i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num + } + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + 
+            # find the closest aspect ratio to the target
+            target_aspect_ratio = find_closest_aspect_ratio(
+                aspect_ratio, target_ratios, orig_width, orig_height, image_size
+            )
+
+            # calculate the target width and height
+            target_width = image_size * target_aspect_ratio[0]
+            target_height = image_size * target_aspect_ratio[1]
+            blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+            # resize the image
+            resized_img = image.resize((target_width, target_height))
+            processed_images = []
+            for i in range(blocks):
+                box = (
+                    (i % (target_width // image_size)) * image_size,
+                    (i // (target_width // image_size)) * image_size,
+                    ((i % (target_width // image_size)) + 1) * image_size,
+                    ((i // (target_width // image_size)) + 1) * image_size,
+                )
+                # split the image
+                split_img = resized_img.crop(box)
+                processed_images.append(split_img)
+            assert len(processed_images) == blocks
+            if use_thumbnail and len(processed_images) != 1:
+                thumbnail_img = image.resize((image_size, image_size))
+                processed_images.append(thumbnail_img)
+            return processed_images
+
+        def load_image(image, input_size=448, max_num=12):
+            transform = build_transform(input_size=input_size)
+            images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+            pixel_values = [transform(image) for image in images]
+            pixel_values = torch.stack(pixel_values)
+            return pixel_values
+
+        if image is not None:
+            if "<image>" not in text:
+                text = "<image>\n" + text
+            pixel_values = load_image(image, input_size=self.config.vision_config.image_size)
+            num_patches = pixel_values.shape[0]
+            num_image_token = int(
+                (self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2
+                * (self.config.downsample_ratio**2)
+            )
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN
+            text = text.replace("<image>", image_tokens, 1)
+            text_inputs = tokenizer(text, return_tensors="pt")
+            inputs = dict(text_inputs)
+            inputs.update({"pixel_values": pixel_values})
+        else:
+            inputs = tokenizer(text, return_tensors="pt")
+        return inputs
+
+    # internvl has an issue with the _get_non_default_generation_parameters check, as a workaround override _prepare_generation_config
+    def _prepare_generation_config(
+        self, generation_config: Optional[GenerationConfig], **kwargs: Dict
+    ) -> Tuple[GenerationConfig, Dict]:
+        using_model_generation_config = False
+        if generation_config is None:
+            if (
+                self.generation_config._from_model_config  # 1)
+                and self.generation_config._original_object_hash == hash(self.generation_config)  # 2)
+            ):
+                new_generation_config = GenerationConfig.from_model_config(self.config)
+                if new_generation_config != self.generation_config:  # 4)
+                    warnings.warn(
+                        "You have modified the pretrained model configuration to control generation. This is a"
+                        " deprecated strategy to control generation and will be removed in v5."
+ " Please use and modify the model generation configuration (see" + " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )", + UserWarning, + ) + self.generation_config = new_generation_config + + generation_config = self.generation_config + using_model_generation_config = True + + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + # If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model + if not using_model_generation_config: + if generation_config.bos_token_id is None: + generation_config.bos_token_id = self.generation_config.bos_token_id + if generation_config.eos_token_id is None: + generation_config.eos_token_id = self.generation_config.eos_token_id + if generation_config.pad_token_id is None: + generation_config.pad_token_id = self.generation_config.pad_token_id + if generation_config.decoder_start_token_id is None: + generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id + + return generation_config, model_kwargs + class _OVMiniCPMVForCausalLM(OVModelForVisualCausalLM): additional_parts = ["resampler"] @@ -1430,14 +1600,22 @@ def merge_vision_text_embeddings( @staticmethod def preprocess_inputs( - processor, text: str, image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): - if image is None: - raise ValueError("Image is required.") - prompt = f"<|im_start|>user\n(./)\n{text}<|im_end|>\n<|im_start|>assistant\n" + if processor is None: + raise ValueError("Processor is required.") + if getattr(processor, "chat_template", None) is not None: + messages = [{"role": "user", "content": text if image is None else "(./)\n" + text}] + prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + else: + prompt = ( + f"<|im_start|>user\n(./)\n{text}<|im_end|>\n<|im_start|>assistant\n" + if image is not None + else text + ) inputs = processor([prompt], [image], return_tensors="pt") return inputs @@ -1615,17 +1793,24 @@ def get_multimodal_embeddings( @staticmethod def preprocess_inputs( - processor, text: str, image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): if tokenizer is None: raise ValueError("Tokenizer is required.") - messages = [{"role": "user", "content": f"\n{text}"}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("")] - input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0) + if image is not None and processor is None: + raise ValueError("Processor is required.") + text_content = f"\n{text}" if image is not None else text + messages = [{"role": "user", "content": text_content}] + if tokenizer.chat_template is not None: + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + if image is not None: + text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("")] + input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0) + else: + input_ids = tokenizer(text, return_tensors="pt").input_ids attention_mask = torch.ones_like(input_ids, dtype=torch.int64) result = {"input_ids": input_ids, "attention_mask": attention_mask} if image is not None: diff --git 
index 06cc16d043..a84b3e8f4f 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -784,7 +784,9 @@ def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig):
             image = Image.open(requests.get(image_url, stream=True).raw)

             try:
-                inputs = self.model.preprocess_inputs(processor, instruction, image, tokenizer)
+                inputs = self.model.preprocess_inputs(
+                    text=instruction, image=image, processor=processor, tokenizer=tokenizer
+                )
             except ValueError as value_error:
                 if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
                     raise tokenizer_error
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 916833602d..be4efee4af 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -1883,9 +1883,9 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     if is_transformers_version(">=", "4.40.0"):
         SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
     if is_transformers_version(">=", "4.45.0"):
-        SUPPORTED_ARCHITECTURES += ["minicpmv"]
-    REMOTE_CODE_MODELS = ["minicpmv", "nanollava"]
+        SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2"]
     TASK = "image-text-to-text"
+    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava"]

     IMAGE = Image.open(
         requests.get(
@@ -1905,45 +1905,25 @@ def get_transformer_model_class(self, model_arch):
             return LlavaNextForConditionalGeneration
         return AutoModelForCausalLM

-    def gen_inputs(self, model_arch, base_text_prompt, image=None):
-        model_id = MODEL_NAMES[model_arch]
-        if "llava" in model_arch:
-            prompt = f"<image>\n {base_text_prompt}"
-        elif "minicpmv" in model_arch:
-            prompt = "<|im_start|>user\n(<image>./</image>)\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n"
-        if model_arch != "nanollava":
-            processor = AutoProcessor.from_pretrained(
-                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
-            )
-            inputs = processor(images=[self.IMAGE.resize((600, 600))], text=[prompt], return_tensors="pt")
-        else:
-            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
-            processor = AutoProcessor.from_pretrained(
-                config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
-            )
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
-            )
-            image_input = None
-            if image is not None:
-                image_input = processor(images=image, return_tensors="pt")["pixel_values"]
-            text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
-
-            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
-            attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
-            inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input}
-        return inputs
-
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
+        prompt = "What is shown in this image?"
         model_id = MODEL_NAMES[model_arch]
+        set_seed(SEED)
         transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(
             model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
         )
+        transformers_model.eval()
+        if "internvl2" in model_arch:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            img_context_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
+            transformers_model.img_context_token_id = img_context_token_id
         if "nanollava" in model_arch:
             transformers_model.get_vision_tower().load_model()
-        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
-
+        preprocessors = self.get_preprocessors(model_arch)
+        set_seed(SEED)
         ov_model = OVModelForVisualCausalLM.from_pretrained(
             model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
         )
@@ -1954,13 +1934,18 @@ def test_compare_to_transformers(self, model_arch):
             self.assertTrue(hasattr(ov_model, additional_part))
             self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
-        # pytorch minicpmv is not designed to be used via forward
-        if "minicpmv" not in model_arch:
+        inputs = ov_model.preprocess_inputs(**preprocessors, text=prompt, image=self.IMAGE.resize((600, 600)))
+        # pytorch minicpmv and internvl are not designed to be used via forward
+        if model_arch not in ["minicpmv", "internvl2"]:
+            set_seed(SEED)
+            ov_outputs = ov_model(**inputs)
             set_seed(SEED)
             with torch.no_grad():
                 transformers_outputs = transformers_model(**inputs)
-            ov_outputs = ov_model(**inputs)
-            self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
+            self.assertTrue(
+                torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4),
+                f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}",
+            )

         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
@@ -1975,9 +1960,11 @@ def test_compare_to_transformers(self, model_arch):
         set_seed(SEED)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         set_seed(SEED)
-        transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
-        # original minicpmv always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch == "minicpmv":
+        with torch.no_grad():
+            transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+
+        # original minicpmv and internvl always skip input tokens in generation results, while the transformers-based approach provides them
+        if model_arch in ["minicpmv", "internvl2"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         self.assertTrue(
             torch.equal(ov_outputs, transformers_outputs),
@@ -2050,18 +2037,19 @@ def test_generate_utils(self, model_arch):
         model = OVModelForVisualCausalLM.from_pretrained(
             model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)

-        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
+        question = "Describe image"
+        preprocessors = self.get_preprocessors(model_arch)
+        inputs = model.preprocess_inputs(**preprocessors, text=question, image=self.IMAGE.resize((600, 600)))
         # General case
         outputs = model.generate(**inputs, max_new_tokens=10)
-        # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200
-        outputs = outputs[:, inputs["input_ids"].shape[1] :]
-        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)

         # No input image case
         question = "Hi, how are you?"
-        inputs = self.gen_inputs(model_arch, question, None)
+        inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
         outputs = model.generate(**inputs, max_new_tokens=10)
         # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200
         outputs = outputs[:, inputs["input_ids"].shape[1] :]
@@ -2071,6 +2059,29 @@

         gc.collect()

+    def get_preprocessors(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        if model_arch == "nanollava":
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+            processor = AutoProcessor.from_pretrained(
+                config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": processor, "tokenizer": tokenizer}
+        elif model_arch == "internvl2":
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": None, "tokenizer": tokenizer}
+        else:
+            processor = AutoProcessor.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": processor, "tokenizer": None}
+        return preprocessors
+

 class OVModelForSpeechSeq2SeqIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ("whisper",)
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index fc3d97e243..313120833f 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -71,6 +71,7 @@
     "ibert": "hf-internal-testing/tiny-random-ibert",
     "internlm": "katuni4ka/tiny-random-internlm",
     "internlm2": "katuni4ka/tiny-random-internlm2",
+    "internvl2": "katuni4ka/tiny-random-internvl2",
     "jais": "katuni4ka/tiny-random-jais",
     "levit": "hf-internal-testing/tiny-random-LevitModel",
     "longt5": "hf-internal-testing/tiny-random-longt5",