From 0b046eedd68a8befeb37080c52a48da14cbde25d Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 24 Oct 2024 14:23:37 +0400 Subject: [PATCH 01/13] fix config saving when check on misplaced args broken --- optimum/exporters/openvino/convert.py | 9 +++++++-- optimum/exporters/openvino/utils.py | 11 +++++++++++ optimum/intel/openvino/modeling_base.py | 6 +++++- optimum/intel/openvino/modeling_base_seq2seq.py | 6 +++++- optimum/intel/openvino/modeling_visual_language.py | 8 ++++++++ 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index df2885fd09..cf0c8be117 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -71,6 +71,7 @@ _get_open_clip_submodels_fn_and_export_configs, clear_class_registry, remove_none_from_dummy_inputs, + save_config, ) @@ -659,7 +660,11 @@ def export_from_model( files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] elif library_name != "diffusers": if is_transformers_version(">=", "4.44.99"): - misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + except Exception: + misplaced_generation_parameters = {} if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: logger.warning( "Moving the following attributes in the config to the generation config: " @@ -671,7 +676,7 @@ def export_from_model( setattr(model.config, param_name, None) # Saving the model config and preprocessor as this is needed sometimes. 
- model.config.save_pretrained(output) + save_config(model.config, output) generation_config = getattr(model, "generation_config", None) if generation_config is not None: try: diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 9286a37f78..d7c01da9da 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -14,6 +14,7 @@ import inspect from collections import namedtuple +from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union from transformers.utils import is_torch_available @@ -209,3 +210,13 @@ def get_submodels(model): MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv"] + + +def save_config(config, save_dir): + try: + config.save_pretrained(save_dir) + except Exception: + save_dir = Path(save_dir) + save_dir.mkdir(exist_ok=True) + output_config_file = Path(save_dir / "config.json") + config.to_json_file(output_config_file, use_diff=True) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index ed3cdadb51..5dd0dbc231 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -136,7 +136,11 @@ def __init__( self.generation_config = generation_config or GenerationConfig.from_model_config(config) if is_transformers_version(">=", "4.44.99"): - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except Exception: + misplaced_generation_parameters = {} if len(misplaced_generation_parameters) > 0: logger.warning( "Moving the following attributes in the config to the generation config: " diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 
06c6011488..20a5afdca7 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -84,7 +84,11 @@ def __init__( self.generation_config = generation_config or GenerationConfig.from_model_config(config) if is_transformers_version(">=", "4.44.99"): - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except Exception: + misplaced_generation_parameters = {} if len(misplaced_generation_parameters) > 0: logger.warning( "Moving the following attributes in the config to the generation config: " diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 8f72a73535..dafefd8df8 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -25,6 +25,7 @@ from ...exporters.openvino import main_export from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name from .. import OVQuantizer +from ...exporters.openvino.utils import save_config from .configuration import OVConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel, OVModelPart from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM @@ -319,6 +320,13 @@ def compile(self): if part_model is not None: part_model._compile() + def _save_config(self, save_directory): + """ + Saves a model configuration into a directory, so that it can be re-loaded using the + [`from_pretrained`] class method. 
+ """ + save_config(self.config, save_directory) + def _save_pretrained(self, save_directory: Union[str, Path]): """ Saves the model to the OpenVINO IR format so that it can be re-loaded using the From 21844a96197a0fc5488efc301d36f9f2d87ab2c0 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 5 Nov 2024 11:51:44 +0400 Subject: [PATCH 02/13] add internvl test --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_modeling.py | 207 ++++++++++++++++---- tests/openvino/utils_tests.py | 1 + 3 files changed, 168 insertions(+), 42 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 5276ade33b..63ee5c5950 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1444,7 +1444,7 @@ class InternVLChatConfigBehavior(str, enum.Enum): @register_in_tasks_manager("internvl-chat", *["image-text-to-text"], library_name="transformers") class InternVLChatOpenVINOConfig(OnnxConfig): SUPPORTED_BEHAVIORS = [model_type.value for model_type in InternVLChatConfigBehavior] - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,) def __init__( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 916833602d..6af7e88058 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1883,9 +1883,9 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"] if is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv"] - REMOTE_CODE_MODELS = ["minicpmv", "nanollava"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "nanollava", "internvl2"] TASK = "image-text-to-text" + REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava"] IMAGE = Image.open( requests.get( @@ -1905,45 
+1905,23 @@ def get_transformer_model_class(self, model_arch): return LlavaNextForConditionalGeneration return AutoModelForCausalLM - def gen_inputs(self, model_arch, base_text_prompt, image=None): - model_id = MODEL_NAMES[model_arch] - if "llava" in model_arch: - prompt = f"\n {base_text_prompt}" - elif "minicpmv" in model_arch: - prompt = "<|im_start|>user\n(./)\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n" - if model_arch != "nanollava": - processor = AutoProcessor.from_pretrained( - model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS - ) - inputs = processor(images=[self.IMAGE.resize((600, 600))], text=[prompt], return_tensors="pt") - else: - config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) - processor = AutoProcessor.from_pretrained( - config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS - ) - tokenizer = AutoTokenizer.from_pretrained( - model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS - ) - image_input = None - if image is not None: - image_input = processor(images=image, return_tensors="pt")["pixel_values"] - text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("")] - - input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0) - attention_mask = torch.ones_like(input_ids, dtype=torch.int64) - inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input} - return inputs - @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): + prompt = "What is shown in this image?" 
model_id = MODEL_NAMES[model_arch] transformers_model = self.get_transformer_model_class(model_arch).from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) - if "nanollava" in model_arch: - transformers_model.get_vision_tower().load_model() - inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE) - + if "internvl2" in model_arch: + tokenizer = AutoTokenizer.from_pretrained( + model_id, trast_remote_code=model_arch in self.REMOTE_CODE_MODELS + ) + img_context_token_id = tokenizer.convert_tokens_to_ids("") + transformers_model.img_context_token_id = img_context_token_id + inputs = self.gen_inputs(model_arch, prompt, self.IMAGE) + set_seed(SEED) + with torch.no_grad(): + transformers_outputs = transformers_model(**inputs) ov_model = OVModelForVisualCausalLM.from_pretrained( model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) @@ -1959,6 +1937,7 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) with torch.no_grad(): transformers_outputs = transformers_model(**inputs) + set_seed(SEED) ov_outputs = ov_model(**inputs) self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) @@ -1976,8 +1955,9 @@ def test_compare_to_transformers(self, model_arch): ov_outputs = ov_model.generate(**inputs, generation_config=gen_config) set_seed(SEED) transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config) - # original minicpmv always skip input tokens in generation results, while transformers based approach provide them - if model_arch == "minicpmv": + + # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them + if model_arch in ["minicpmv", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] self.assertTrue( torch.equal(ov_outputs, transformers_outputs), @@ -2050,13 +2030,13 @@ def test_generate_utils(self, model_arch): model = 
OVModelForVisualCausalLM.from_pretrained( model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) - inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE) + question = "Describe image" + inputs = self.gen_inputs(model_arch, question, self.IMAGE) # General case outputs = model.generate(**inputs, max_new_tokens=10) - # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200 - outputs = outputs[:, inputs["input_ids"].shape[1] :] - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) + outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True) self.assertIsInstance(outputs[0], str) # No input image case @@ -2071,6 +2051,151 @@ def test_generate_utils(self, model_arch): gc.collect() + def gen_inputs(self, model_arch, base_text_prompt, image=None): + model_id = MODEL_NAMES[model_arch] + if "llava" in model_arch: + prompt = f"\n {base_text_prompt}" if image is not None else base_text_prompt + elif "minicpmv" in model_arch: + prompt = "<|im_start|>user\n(./)\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n" if image is not None else base_text_prompt + elif "internvl2" in model_arch: + prompt = ( + "<|im_start|>user\n\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n" + if image is not None + else base_text_prompt + ) + if model_arch == "nanollava": + config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + processor = AutoProcessor.from_pretrained( + config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS + ) + tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS + ) + image_input = None + if image is not None: + image_input = processor(images=image, 
return_tensors="pt")["pixel_values"] + text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("")] + + input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0) + attention_mask = torch.ones_like(input_ids, dtype=torch.int64) + inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input} + elif model_arch == "internvl2": + config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS + ) + inputs = self.internvl_input_transform(config, tokenizer, prompt, image) + else: + processor = AutoProcessor.from_pretrained( + model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS + ) + inputs = processor(images=[image.resize((600, 600))] if image is not None else None, text=[prompt], return_tensors="pt") + return inputs + + def internvl_input_transform(self, config, tokenizer, prompt, image=None): + import torchvision.transforms as T + from torchvision.transforms.functional import InterpolationMode + + IMG_START_TOKEN = "" + IMG_END_TOKEN = "" + IMG_CONTEXT_TOKEN = "" + + IMAGENET_MEAN = (0.485, 0.456, 0.406) + IMAGENET_STD = (0.229, 0.224, 0.225) + + def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose( + [ + T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD), + ] + ) + return transform + + def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float("inf") + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = 
ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + def dynamic_preprocess(image, min_num=1, max_num=12, image_size=28, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num + ) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size + ) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size, + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + def load_image(image, input_size=448, max_num=12): + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + 
return pixel_values + + if image is not None: + pixel_values = load_image(image, input_size=config.vision_config.image_size) + num_patches = pixel_values.shape[0] + num_image_token = int( + (config.vision_config.image_size // config.vision_config.patch_size) ** 2 + * (config.downsample_ratio**2) + ) + image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN + prompt = prompt.replace("", image_tokens, 1) + text_inputs = tokenizer(prompt, return_tensors="pt") + inputs = dict(text_inputs) + inputs.update({"pixel_values": pixel_values}) + else: + inputs = tokenizer(prompt, return_tensors="pt") + return inputs + class OVModelForSpeechSeq2SeqIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ("whisper",) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index fc3d97e243..313120833f 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -71,6 +71,7 @@ "ibert": "hf-internal-testing/tiny-random-ibert", "internlm": "katuni4ka/tiny-random-internlm", "internlm2": "katuni4ka/tiny-random-internlm2", + "internvl2": "katuni4ka/tiny-random-internvl2", "jais": "katuni4ka/tiny-random-jais", "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-longt5", From d6f8d17243b5c4f7cf4bdaea89f8663052363149 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 5 Nov 2024 12:21:20 +0400 Subject: [PATCH 03/13] fix tests --- tests/openvino/test_modeling.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 6af7e88058..1e61aaf16e 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1919,9 +1919,6 @@ def test_compare_to_transformers(self, model_arch): img_context_token_id = tokenizer.convert_tokens_to_ids("") transformers_model.img_context_token_id = img_context_token_id inputs = self.gen_inputs(model_arch, 
prompt, self.IMAGE) - set_seed(SEED) - with torch.no_grad(): - transformers_outputs = transformers_model(**inputs) ov_model = OVModelForVisualCausalLM.from_pretrained( model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) @@ -1933,13 +1930,13 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part]) self.assertIsInstance(ov_model.config, PretrainedConfig) # pytorch minicpmv is not designed to be used via forward - if "minicpmv" not in model_arch: + if model_arch != "minicpmv": set_seed(SEED) with torch.no_grad(): transformers_outputs = transformers_model(**inputs) set_seed(SEED) ov_outputs = ov_model(**inputs) - self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=5e-3)) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None @@ -1956,7 +1953,7 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config) - # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them + # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them if model_arch in ["minicpmv", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] self.assertTrue( @@ -2056,7 +2053,11 @@ def gen_inputs(self, model_arch, base_text_prompt, image=None): if "llava" in model_arch: prompt = f"\n {base_text_prompt}" if image is not None else base_text_prompt elif "minicpmv" in model_arch: - prompt = "<|im_start|>user\n(./)\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n" if image is not None else base_text_prompt + prompt = ( + "<|im_start|>user\n(./)\n 
{base_text_prompt}<|im_end|>\n<|im_start|>assistant\n" + if image is not None + else base_text_prompt + ) elif "internvl2" in model_arch: prompt = ( "<|im_start|>user\n\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n" @@ -2071,6 +2072,7 @@ def gen_inputs(self, model_arch, base_text_prompt, image=None): tokenizer = AutoTokenizer.from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) +<<<<<<< HEAD image_input = None if image is not None: image_input = processor(images=image, return_tensors="pt")["pixel_values"] @@ -2080,6 +2082,12 @@ def gen_inputs(self, model_arch, base_text_prompt, image=None): attention_mask = torch.ones_like(input_ids, dtype=torch.int64) inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input} elif model_arch == "internvl2": +======= + inputs = processor( + images=[image.resize((600, 600))] if image is not None else None, text=[prompt], return_tensors="pt" + ) + else: +>>>>>>> fix tests config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) tokenizer = AutoTokenizer.from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS @@ -2135,13 +2143,13 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=28, use_thumbnai aspect_ratio = orig_width / orig_height # calculate the existing image aspect ratio - target_ratios = set( + target_ratios = { (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num - ) + } target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) # find the closest aspect ratio to the target From a1838886af007c955885fe30911b90e40bf3298b Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 5 Nov 2024 12:21:20 +0400 Subject: [PATCH 04/13] fix tests --- tests/openvino/test_modeling.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/openvino/test_modeling.py 
b/tests/openvino/test_modeling.py index 1e61aaf16e..f7b3f3b86b 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1912,6 +1912,7 @@ def test_compare_to_transformers(self, model_arch): transformers_model = self.get_transformer_model_class(model_arch).from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) + transformers_model.eval() if "internvl2" in model_arch: tokenizer = AutoTokenizer.from_pretrained( model_id, trast_remote_code=model_arch in self.REMOTE_CODE_MODELS @@ -2072,7 +2073,6 @@ def gen_inputs(self, model_arch, base_text_prompt, image=None): tokenizer = AutoTokenizer.from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) -<<<<<<< HEAD image_input = None if image is not None: image_input = processor(images=image, return_tensors="pt")["pixel_values"] @@ -2082,12 +2082,6 @@ def gen_inputs(self, model_arch, base_text_prompt, image=None): attention_mask = torch.ones_like(input_ids, dtype=torch.int64) inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input} elif model_arch == "internvl2": -======= - inputs = processor( - images=[image.resize((600, 600))] if image is not None else None, text=[prompt], return_tensors="pt" - ) - else: ->>>>>>> fix tests config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) tokenizer = AutoTokenizer.from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS From d211fc7860b047f0e9136838cce8252c7505126e Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 5 Nov 2024 16:15:24 +0400 Subject: [PATCH 05/13] numeric stability in tests --- tests/openvino/test_modeling.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index f7b3f3b86b..6218de16e3 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ 
-1909,6 +1909,7 @@ def get_transformer_model_class(self, model_arch): def test_compare_to_transformers(self, model_arch): prompt = "What is shown in this image?" model_id = MODEL_NAMES[model_arch] + set_seed(SEED) transformers_model = self.get_transformer_model_class(model_arch).from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) @@ -1919,7 +1920,10 @@ def test_compare_to_transformers(self, model_arch): ) img_context_token_id = tokenizer.convert_tokens_to_ids("") transformers_model.img_context_token_id = img_context_token_id + if "nanollava" in model_arch: + transformers_model.get_vision_tower().load_model() inputs = self.gen_inputs(model_arch, prompt, self.IMAGE) + set_seed(SEED) ov_model = OVModelForVisualCausalLM.from_pretrained( model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) @@ -1930,14 +1934,17 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue(hasattr(ov_model, additional_part)) self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part]) self.assertIsInstance(ov_model.config, PretrainedConfig) - # pytorch minicpmv is not designed to be used via forward - if model_arch != "minicpmv": + # pytorch minicpmv and internvl are not designed to be used via forward + if not model_arch in ["minicpmv", "internvl2"]: + set_seed(SEED) + ov_outputs = ov_model(**inputs) set_seed(SEED) with torch.no_grad(): transformers_outputs = transformers_model(**inputs) - set_seed(SEED) - ov_outputs = ov_model(**inputs) - self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=5e-3)) + self.assertTrue( + torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4), + f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}", + ) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None @@ -2076,9 +2083,11 @@ def gen_inputs(self, model_arch, 
base_text_prompt, image=None): image_input = None if image is not None: image_input = processor(images=image, return_tensors="pt")["pixel_values"] - text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("")] + text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("")] - input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0) + input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0) + else: + input_ids = tokenizer(prompt, return_tensors="pt").input_ids attention_mask = torch.ones_like(input_ids, dtype=torch.int64) inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input} elif model_arch == "internvl2": @@ -2091,7 +2100,9 @@ def gen_inputs(self, model_arch, base_text_prompt, image=None): processor = AutoProcessor.from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) - inputs = processor(images=[image.resize((600, 600))] if image is not None else None, text=[prompt], return_tensors="pt") + inputs = processor( + images=[image.resize((600, 600))] if image is not None else None, text=[prompt], return_tensors="pt" + ) return inputs def internvl_input_transform(self, config, tokenizer, prompt, image=None): From cd3b8bd98c8baf709f549baa29980294967d2a00 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 5 Nov 2024 18:15:53 +0400 Subject: [PATCH 06/13] fix code style --- tests/openvino/test_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 6218de16e3..edbbce5514 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1935,7 +1935,7 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part]) self.assertIsInstance(ov_model.config, PretrainedConfig) # pytorch minicpmv and internvl are 
not designed to be used via forward - if not model_arch in ["minicpmv", "internvl2"]: + if model_arch not in ["minicpmv", "internvl2"]: set_seed(SEED) ov_outputs = ov_model(**inputs) set_seed(SEED) From 834500119eb8e878becb28b3b688202d1023f00d Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 11 Nov 2024 20:52:16 +0400 Subject: [PATCH 07/13] update and reuse preprocess_inputs --- optimum/exporters/openvino/convert.py | 2 +- optimum/exporters/openvino/utils.py | 11 +- optimum/intel/openvino/modeling_base.py | 2 +- .../intel/openvino/modeling_base_seq2seq.py | 2 +- .../openvino/modeling_visual_language.py | 155 ++++++++++++++++-- tests/openvino/test_modeling.py | 152 ++--------------- 6 files changed, 168 insertions(+), 156 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index cf0c8be117..d7e27cd94c 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -663,7 +663,7 @@ def export_from_model( # some model configs may have issues with loading without parameters initialization try: misplaced_generation_parameters = model.config._get_non_default_generation_parameters() - except Exception: + except KeyError: misplaced_generation_parameters = {} if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: logger.warning( diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index d7c01da9da..71cfa7db19 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -13,6 +13,7 @@ # limitations under the License. 
import inspect +import logging from collections import namedtuple from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -26,6 +27,9 @@ from optimum.utils import is_diffusers_available +logger = logging.getLogger(__name__) + + InputInfo = namedtuple("InputInfo", ["name", "shape", "type", "example"]) @@ -215,8 +219,11 @@ def get_submodels(model): def save_config(config, save_dir): try: config.save_pretrained(save_dir) - except Exception: + except Exception as exp: + logger.warning( + f"Attempt to save config using standard API is failed with {exp}. It may be issue with model config, please check its correctness before usage." + ) save_dir = Path(save_dir) - save_dir.mkdir(exist_ok=True) + save_dir.mkdir(exist_ok=True, parents=True) output_config_file = Path(save_dir / "config.json") config.to_json_file(output_config_file, use_diff=True) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 5dd0dbc231..320d77c4ca 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -139,7 +139,7 @@ def __init__( # some model configs may have issues with loading without parameters initialization try: misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except Exception: + except KeyError: misplaced_generation_parameters = {} if len(misplaced_generation_parameters) > 0: logger.warning( diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 20a5afdca7..0ce15641fe 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -87,7 +87,7 @@ def __init__( # some model configs may have issues with loading without parameters initialization try: misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except Exception: + except KeyError: misplaced_generation_parameters = {} 
if len(misplaced_generation_parameters) > 0: logger.warning( diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index dafefd8df8..274c3ee0c8 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -24,8 +24,8 @@ from ...exporters.openvino import main_export from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name -from .. import OVQuantizer from ...exporters.openvino.utils import save_config +from .. import OVQuantizer from .configuration import OVConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel, OVModelPart from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM @@ -915,10 +915,16 @@ def preprocess_inputs( image: Optional[Image] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): - if image is None: - raise ValueError("Image is required.") - chat_template = [{"role": "user", "content": [{"type": "text", "text": text}, {"type": "image"}]}] - prompt = processor.apply_chat_template(chat_template, add_generation_prompt=True) + if processor.chat_template is not None: + chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}] + if image is not None: + chat_prompt[0]["content"].append({"type": "image"}) + prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False) + else: + if image is not None and "" not in text: + prompt = "\n" + text + else: + prompt = text inputs = processor(images=image, text=prompt, return_tensors="pt") return inputs @@ -1217,6 +1223,120 @@ def merge_vision_text_embeddings( input_embeds = input_embeds.reshape(B, N, C) return input_embeds, attention_mask, position_ids + def preprocess_inputs( + self, + processor=None, + text: str = "", + image: Optional[Image] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + ): + if tokenizer is None: + raise 
ValueError("Tokenizer is required.") + import torchvision.transforms as T + from torchvision.transforms.functional import InterpolationMode + + IMG_START_TOKEN = "" + IMG_END_TOKEN = "" + IMG_CONTEXT_TOKEN = "" + + IMAGENET_MEAN = (0.485, 0.456, 0.406) + IMAGENET_STD = (0.229, 0.224, 0.225) + + def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose( + [ + T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD), + ] + ) + return transform + + def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float("inf") + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + def dynamic_preprocess(image, min_num=1, max_num=12, image_size=28, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = { + (i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num + } + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size + ) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * 
target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size, + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + def load_image(image, input_size=448, max_num=12): + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + if image is not None: + if "" not in text: + text = "\n" + text + pixel_values = load_image(image, input_size=self.config.vision_config.image_size) + num_patches = pixel_values.shape[0] + num_image_token = int( + (self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2 + * (self.config.downsample_ratio**2) + ) + image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN + text = text.replace("", image_tokens, 1) + text_inputs = tokenizer(text, return_tensors="pt") + inputs = dict(text_inputs) + inputs.update({"pixel_values": pixel_values}) + else: + inputs = tokenizer(text, return_tensors="pt") + return inputs + class _OVMiniCPMVForCausalLM(OVModelForVisualCausalLM): additional_parts = ["resampler"] @@ -1443,9 +1563,15 @@ def preprocess_inputs( image: Optional[Image] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): - if image is None: - raise ValueError("Image is required.") - 
prompt = f"<|im_start|>user\n(./)\n{text}<|im_end|>\n<|im_start|>assistant\n" + if processor.chat_template is not None: + messages = [{"role": "user", "content": text if image is None else "(./)\n" + text}] + prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + else: + prompt = ( + f"<|im_start|>user\n(./)\n{text}<|im_end|>\n<|im_start|>assistant\n" + if image is not None + else text + ) inputs = processor([prompt], [image], return_tensors="pt") return inputs @@ -1630,10 +1756,15 @@ def preprocess_inputs( ): if tokenizer is None: raise ValueError("Tokenizer is required.") - messages = [{"role": "user", "content": f"\n{text}"}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("")] - input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0) + text_content = f"\n{text}" if image is not None else text + messages = [{"role": "user", "content": text_content}] + if tokenizer.chat_template is not None: + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + if image is not None: + text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("")] + input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0) + else: + input_ids = tokenizer(text, return_tensors="pt").input_ids attention_mask = torch.ones_like(input_ids, dtype=torch.int64) result = {"input_ids": input_ids, "attention_mask": attention_mask} if image is not None: diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index edbbce5514..5907517f62 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1883,7 +1883,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"] if 
is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv", "nanollava", "internvl2"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2"] TASK = "image-text-to-text" REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava"] @@ -1922,7 +1922,7 @@ def test_compare_to_transformers(self, model_arch): transformers_model.img_context_token_id = img_context_token_id if "nanollava" in model_arch: transformers_model.get_vision_tower().load_model() - inputs = self.gen_inputs(model_arch, prompt, self.IMAGE) + preprocessors = self.get_preprocessors(model_arch) set_seed(SEED) ov_model = OVModelForVisualCausalLM.from_pretrained( model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS @@ -1934,6 +1934,7 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue(hasattr(ov_model, additional_part)) self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part]) self.assertIsInstance(ov_model.config, PretrainedConfig) + inputs = ov_model.preprocess_inputs(**preprocessors, text=prompt, image=self.IMAGE.resize((600, 600))) # pytorch minicpmv and internvl are not designed to be used via forward if model_arch not in ["minicpmv", "internvl2"]: set_seed(SEED) @@ -1959,7 +1960,8 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) ov_outputs = ov_model.generate(**inputs, generation_config=gen_config) set_seed(SEED) - transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config) + with torch.no_grad(): + transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config) # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them if model_arch in ["minicpmv", "internvl2"]: @@ -2038,7 +2040,8 @@ def test_generate_utils(self, model_arch): tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) question = 
"Describe image" - inputs = self.gen_inputs(model_arch, question, self.IMAGE) + preprocessors = self.get_preprocessors(model_arch) + inputs = model.preprocess_inputs(**preprocessors, text=question, image=self.IMAGE.resize((600, 600))) # General case outputs = model.generate(**inputs, max_new_tokens=10) outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True) @@ -2046,7 +2049,7 @@ def test_generate_utils(self, model_arch): # No input image case question = "Hi, how are you?" - inputs = self.gen_inputs(model_arch, question, None) + inputs = model.preprocess_inputs(**preprocessors, text=question, image=None) outputs = model.generate(**inputs, max_new_tokens=10) # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200 outputs = outputs[:, inputs["input_ids"].shape[1] :] @@ -2056,22 +2059,8 @@ def test_generate_utils(self, model_arch): gc.collect() - def gen_inputs(self, model_arch, base_text_prompt, image=None): + def get_preprocessors(self, model_arch): model_id = MODEL_NAMES[model_arch] - if "llava" in model_arch: - prompt = f"\n {base_text_prompt}" if image is not None else base_text_prompt - elif "minicpmv" in model_arch: - prompt = ( - "<|im_start|>user\n(./)\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n" - if image is not None - else base_text_prompt - ) - elif "internvl2" in model_arch: - prompt = ( - "<|im_start|>user\n\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n" - if image is not None - else base_text_prompt - ) if model_arch == "nanollava": config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) processor = AutoProcessor.from_pretrained( @@ -2080,134 +2069,19 @@ def gen_inputs(self, model_arch, base_text_prompt, image=None): tokenizer = AutoTokenizer.from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) - image_input = None - if image is not None: - 
image_input = processor(images=image, return_tensors="pt")["pixel_values"] - text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("")] - - input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0) - else: - input_ids = tokenizer(prompt, return_tensors="pt").input_ids - attention_mask = torch.ones_like(input_ids, dtype=torch.int64) - inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input} + preprocessors = {"processor": processor, "tokenizer": tokenizer} elif model_arch == "internvl2": config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) tokenizer = AutoTokenizer.from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) - inputs = self.internvl_input_transform(config, tokenizer, prompt, image) + preprocessors = {"processor": None, "tokenizer": tokenizer} else: processor = AutoProcessor.from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) - inputs = processor( - images=[image.resize((600, 600))] if image is not None else None, text=[prompt], return_tensors="pt" - ) - return inputs - - def internvl_input_transform(self, config, tokenizer, prompt, image=None): - import torchvision.transforms as T - from torchvision.transforms.functional import InterpolationMode - - IMG_START_TOKEN = "" - IMG_END_TOKEN = "" - IMG_CONTEXT_TOKEN = "" - - IMAGENET_MEAN = (0.485, 0.456, 0.406) - IMAGENET_STD = (0.229, 0.224, 0.225) - - def build_transform(input_size): - MEAN, STD = IMAGENET_MEAN, IMAGENET_STD - transform = T.Compose( - [ - T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), - T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), - T.ToTensor(), - T.Normalize(mean=MEAN, std=STD), - ] - ) - return transform - - def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): - best_ratio_diff = float("inf") - 
best_ratio = (1, 1) - area = width * height - for ratio in target_ratios: - target_aspect_ratio = ratio[0] / ratio[1] - ratio_diff = abs(aspect_ratio - target_aspect_ratio) - if ratio_diff < best_ratio_diff: - best_ratio_diff = ratio_diff - best_ratio = ratio - elif ratio_diff == best_ratio_diff: - if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: - best_ratio = ratio - return best_ratio - - def dynamic_preprocess(image, min_num=1, max_num=12, image_size=28, use_thumbnail=False): - orig_width, orig_height = image.size - aspect_ratio = orig_width / orig_height - - # calculate the existing image aspect ratio - target_ratios = { - (i, j) - for n in range(min_num, max_num + 1) - for i in range(1, n + 1) - for j in range(1, n + 1) - if i * j <= max_num and i * j >= min_num - } - target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) - - # find the closest aspect ratio to the target - target_aspect_ratio = find_closest_aspect_ratio( - aspect_ratio, target_ratios, orig_width, orig_height, image_size - ) - - # calculate the target width and height - target_width = image_size * target_aspect_ratio[0] - target_height = image_size * target_aspect_ratio[1] - blocks = target_aspect_ratio[0] * target_aspect_ratio[1] - - # resize the image - resized_img = image.resize((target_width, target_height)) - processed_images = [] - for i in range(blocks): - box = ( - (i % (target_width // image_size)) * image_size, - (i // (target_width // image_size)) * image_size, - ((i % (target_width // image_size)) + 1) * image_size, - ((i // (target_width // image_size)) + 1) * image_size, - ) - # split the image - split_img = resized_img.crop(box) - processed_images.append(split_img) - assert len(processed_images) == blocks - if use_thumbnail and len(processed_images) != 1: - thumbnail_img = image.resize((image_size, image_size)) - processed_images.append(thumbnail_img) - return processed_images - - def load_image(image, input_size=448, max_num=12): - transform = 
build_transform(input_size=input_size) - images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) - pixel_values = [transform(image) for image in images] - pixel_values = torch.stack(pixel_values) - return pixel_values - - if image is not None: - pixel_values = load_image(image, input_size=config.vision_config.image_size) - num_patches = pixel_values.shape[0] - num_image_token = int( - (config.vision_config.image_size // config.vision_config.patch_size) ** 2 - * (config.downsample_ratio**2) - ) - image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN - prompt = prompt.replace("", image_tokens, 1) - text_inputs = tokenizer(prompt, return_tensors="pt") - inputs = dict(text_inputs) - inputs.update({"pixel_values": pixel_values}) - else: - inputs = tokenizer(prompt, return_tensors="pt") - return inputs + preprocessors = {"processor": processor, "tokenizer": None} + return preprocessors class OVModelForSpeechSeq2SeqIntegrationTest(unittest.TestCase): From ed1d3054a089a2507aaac825d4458d7567318509 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 12 Nov 2024 08:16:17 +0400 Subject: [PATCH 08/13] Update optimum/exporters/openvino/utils.py Co-authored-by: Nikita Savelyev --- optimum/exporters/openvino/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 71cfa7db19..7013342099 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -221,7 +221,7 @@ def save_config(config, save_dir): config.save_pretrained(save_dir) except Exception as exp: logger.warning( - f"Attempt to save config using standard API is failed with {exp}. It may be issue with model config, please check its correctness before usage." + f"Attempt to save config using standard API has failed with {exp}. 
There may be an issue with model config, please check its correctness before usage." ) save_dir = Path(save_dir) save_dir.mkdir(exist_ok=True, parents=True) From 563f05eeb7cf1109e45e90f07f7dfc23956b57e5 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 12 Nov 2024 08:16:26 +0400 Subject: [PATCH 09/13] Update tests/openvino/test_modeling.py Co-authored-by: Nikita Savelyev --- tests/openvino/test_modeling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 5907517f62..be4efee4af 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -2071,7 +2071,6 @@ def get_preprocessors(self, model_arch): ) preprocessors = {"processor": processor, "tokenizer": tokenizer} elif model_arch == "internvl2": - config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) tokenizer = AutoTokenizer.from_pretrained( model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS ) From 38bb1a92e88e40d4b28bc2de288405a4045cafef Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 12 Nov 2024 08:30:03 +0400 Subject: [PATCH 10/13] change preprocess_inputs signature --- .../openvino/modeling_visual_language.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 274c3ee0c8..5ebf877515 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -15,6 +15,7 @@ from PIL.Image import Image from transformers import ( AutoConfig, + AutoImageProcessor, GenerationConfig, GenerationMixin, PretrainedConfig, @@ -736,9 +737,9 @@ def can_generate(self): @staticmethod @abstractmethod def preprocess_inputs( - processor, text: str, image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = 
None, ): """ @@ -910,12 +911,14 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values): @staticmethod def preprocess_inputs( - processor, text: str, image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): - if processor.chat_template is not None: + if processor is None: + raise ValueError("Processor is required.") + if getattr(processor, "chat_template", None) is not None: chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}] if image is not None: chat_prompt[0]["content"].append({"type": "image"}) @@ -1225,9 +1228,9 @@ def merge_vision_text_embeddings( def preprocess_inputs( self, - processor=None, text: str = "", image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): if tokenizer is None: @@ -1558,12 +1561,14 @@ def merge_vision_text_embeddings( @staticmethod def preprocess_inputs( - processor, text: str, image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): - if processor.chat_template is not None: + if processor is None: + raise ValueError("Processor is required.") + if getattr(processor, "chat_template", None) is not None: messages = [{"role": "user", "content": text if image is None else "(./)\n" + text}] prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) else: @@ -1749,13 +1754,15 @@ def get_multimodal_embeddings( @staticmethod def preprocess_inputs( - processor, text: str, image: Optional[Image] = None, + processor: Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = None, ): if tokenizer is None: raise ValueError("Tokenizer is required.") + if image is not None and processor is None: + raise ValueError("Processor is required.") text_content = f"\n{text}" if image is not None else text messages = [{"role": 
"user", "content": text_content}] if tokenizer.chat_template is not None: From 2644ce37ac7cebdc98da90ff5f59798fbd6f3093 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 12 Nov 2024 08:53:06 +0400 Subject: [PATCH 11/13] fix quantization after signature update --- optimum/intel/openvino/quantization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 06cc16d043..a84b3e8f4f 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -784,7 +784,9 @@ def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig): image = Image.open(requests.get(image_url, stream=True).raw) try: - inputs = self.model.preprocess_inputs(processor, instruction, image, tokenizer) + inputs = self.model.preprocess_inputs( + text=instruction, image=image, processor=processor, tokenizer=tokenizer + ) except ValueError as value_error: if "Tokenizer is required." 
in str(value_error) and tokenizer_error is not None: raise tokenizer_error From 185e0c1a4b0bb765508e74e65a216bb8ad233ac7 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 12 Nov 2024 10:30:23 +0400 Subject: [PATCH 12/13] fix preparing generation config --- .../openvino/modeling_visual_language.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 5ebf877515..8f681a6c26 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -1340,6 +1340,45 @@ def load_image(image, input_size=448, max_num=12): inputs = tokenizer(text, return_tensors="pt") return inputs + # internvl has issue with check _get_non_default_parameters, as a workaround override _prepare_generation_config + def _prepare_generation_config( + self, generation_config: Optional[GenerationConfig], **kwargs: Dict + ) -> Tuple[GenerationConfig, Dict]: + using_model_generation_config = False + if generation_config is None: + if ( + self.generation_config._from_model_config # 1) + and self.generation_config._original_object_hash == hash(self.generation_config) # 2) + ): + new_generation_config = GenerationConfig.from_model_config(self.config) + if new_generation_config != self.generation_config: # 4) + warnings.warn( + "You have modified the pretrained model configuration to control generation. This is a" + " deprecated strategy to control generation and will be removed in v5."
+ " Please use and modify the model generation configuration (see" + " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )", + UserWarning, + ) + self.generation_config = new_generation_config + + generation_config = self.generation_config + using_model_generation_config = True + + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + # If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model + if not using_model_generation_config: + if generation_config.bos_token_id is None: + generation_config.bos_token_id = self.generation_config.bos_token_id + if generation_config.eos_token_id is None: + generation_config.eos_token_id = self.generation_config.eos_token_id + if generation_config.pad_token_id is None: + generation_config.pad_token_id = self.generation_config.pad_token_id + if generation_config.decoder_start_token_id is None: + generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id + + return generation_config, model_kwargs + class _OVMiniCPMVForCausalLM(OVModelForVisualCausalLM): additional_parts = ["resampler"] From ad1687eb2bd79cd6fa2ff1fc605c9faa76ae983e Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 12 Nov 2024 13:18:44 +0400 Subject: [PATCH 13/13] Update optimum/intel/openvino/modeling_visual_language.py Co-authored-by: Nikita Savelyev --- optimum/intel/openvino/modeling_visual_language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 8f681a6c26..b7bf96a0de 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -1228,7 +1228,7 @@ def merge_vision_text_embeddings( def preprocess_inputs( self, - text: str = "", + text: str, image: Optional[Image] = None, processor: 
Optional[AutoImageProcessor] = None, tokenizer: Optional[PreTrainedTokenizer] = None,